语音转文本 - 将发言人标签映射到相应的JSON响应中的转录文本 - 问答

{ "results": [ { "alternatives": [ { "timestamps": [ [ "the", 6.18, 6.63 ], [ "weather", 6.63, 6.95 ], [ "is", 6.95, 7.53 ], [ "sunny", 7.73, 8.11 ], [ "it's", 8.21, 8.5 ], [ "time", 8.5, 8.66 ], [ "to", 8.66, 8.81 ], [ "sip", 8.81, 8.99 ], [ "in", 8.99, 9.02 ], [ "some", 9.02, 9.25 ], [ "cold", 9.25, 9.32 ], [ "beer", 9.32, 9.68 ] ], "confidence": 0.812, "transcript": "the weather is sunny it's time to sip in some cold beer " } ], "final": "True" }, { "alternatives": [ { "timestamps": [ [ "sure", 10.52, 10.88 ], [ "that", 10.92, 11.19 ], [ "sounds", 11.68, 11.82 ], [ "like", 11.82, 12.11 ], [ "a", 12.32, 12.96 ], [ "plan", 12.99, 13.8 ] ], "confidence": 0.829, "transcript": "sure that sounds like a plan" } ], "final": "True" } ], "result_index":0, "speaker_labels": [ { "from": 6.18, "to": 6.63, "speaker": 0, "confidence": 0.475, "final": "False" }, { "from": 6.63, "to": 6.95, "speaker": 0, "confidence": 0.475, "final": "False" }, { "from": 6.95, "to": 7.53, "speaker": 0, "confidence": 0.475, "final": "False" }, { "from": 7.73, "to": 8.11, "speaker": 0, "confidence": 0.499, "final": "False" }, { "from": 8.21, "to": 8.5, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 8.5, "to": 8.66, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 8.66, "to": 8.81, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 8.81, "to": 8.99, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 8.99, "to": 9.02, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 9.02, "to": 9.25, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 9.25, "to": 9.32, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 9.32, "to": 9.68, "speaker": 0, "confidence": 0.472, "final": "False" }, { "from": 10.52, "to": 10.88, "speaker": 2, "confidence": 0.441, "final": "False" }, { "from": 10.92, "to": 11.19, "speaker": 2, "confidence": 0.364, "final": "False" }, { "from": 11.68, "to": 11.82, "speaker": 2, 
"confidence": 0.372, "final": "False" }, { "from": 11.82, "to": 12.11, "speaker": 2, "confidence": 0.372, "final": "False" }, { "from": 12.32, "to": 12.96, "speaker": 2, "confidence": 0.383, "final": "False" }, { "from": 12.99, "to": 13.8, "speaker": 2, "confidence": 0.428, "final": "False" } ] }

import json


def map_words_to_speakers(data):
    """Attach a speaker id to every recognized word.

    A word is paired with each entry of ``data['speaker_labels']`` whose
    ``from`` value equals the word's start time. Words without a matching
    label are left out of the result.

    Args:
        data: Parsed speech-to-text response. Only the first alternative
            of each entry in ``data['results']`` is read; its
            ``timestamps`` items are ``[word, start, end]``.

    Returns:
        A list of ``(word, speaker, start, end)`` tuples, in transcript
        order.
    """
    # Index the labels by start time once, so each word needs one lookup
    # instead of a scan over every label.
    labels_by_start = {}
    for label in data['speaker_labels']:
        labels_by_start.setdefault(label['from'], []).append(label)

    return [
        (stamp[0], label['speaker'], stamp[1], stamp[2])
        for result in data['results']
        for stamp in result['alternatives'][0]['timestamps']
        for label in labels_by_start.get(stamp[1], ())
    ]


if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: python script.py RESPONSE.json')

    with open(sys.argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(map_words_to_speakers(data))

[('the', 0, 6.18, 6.63), ('weather', 0, 6.63, 6.95), ('is', 0, 6.95, 7.53), ('sunny', 0, 7.73, 8.11), ("it's", 0, 8.21, 8.5), ('time', 0, 8.5, 8.66), ('to', 0, 8.66, 8.81), ('sip', 0, 8.81, 8.99), ('in', 0, 8.99, 9.02), ('some', 0, 9.02, 9.25), ('cold', 0, 9.25, 9.32), ('beer', 0, 9.32, 9.68), ('sure', 2, 10.52, 10.88), ('that', 2, 10.92, 11.19), ('sounds', 2, 11.68, 11.82), ('like', 2, 11.82, 12.11), ('a', 2, 12.32, 12.96), ('plan', 2, 12.99, 13.8)]

3条回答

网友

1楼 · 编辑于 2024-04-25 10:03:58

我用JS尝试实现了一下，看看思路是否与使用Python时类似：

// Prints one "Speaker <id> <transcript>" line per recognition result.
// The speaker is taken from the label that lines up with the FIRST word of
// the result; the whole transcript of that result is attributed to it.
// Reads the parsed response from `sTot_resuts`, which must already be in scope.

// Index into speaker_labels of the label expected to match the first word
// of the result currently being examined.
var resultTimestampLen = 0;

var arrLen = sTot_resuts.results.length;
for (var i = 0; i < arrLen; i++) {
    var alternative = sTot_resuts.results[i].alternatives[0];

    var speakerLabel = sTot_resuts.speaker_labels[resultTimestampLen];
    if (!speakerLabel) {
        // No labels left: later results cannot be matched either.
        break;
    }

    var speakerLablefrom = speakerLabel.from;
    var speakerLabelto = speakerLabel.to;
    var speakerId = speakerLabel.speaker;

    // [word, from, to] of the first word in this result.
    var findSpeaker = alternative.timestamps[0];
    if (!findSpeaker) {
        // Result without word timestamps: nothing to compare against.
        continue;
    }

    var timeStampFrom = findSpeaker[1];
    var timeStampto = findSpeaker[2];

    if (timeStampFrom === speakerLablefrom && timeStampto === speakerLabelto) {
        console.log('Speaker ' + speakerId + ' ' + alternative.transcript);

        // Skip past the labels of every word in this result so the index
        // points at the first word of the next result.
        var resultsTimestamp = alternative.timestamps.length;
        resultTimestampLen = resultsTimestamp + resultTimestampLen;
    } else {
        // Mismatch: dump both sides for debugging. The label index is not advanced.
        console.log('resultTimestampLen '+resultTimestampLen + 'speakerLablefrom '+speakerLablefrom + 'speakerLabelto '+speakerLabelto + 'timeStampFrom '+timeStampFrom + 'timeStampto '+timeStampto);
    }
}

网友

2楼 · 编辑于 2024-04-25 10:03:58

我先以单词的时间戳为键把单词放入dict中，然后再将单词与说话者匹配：

# Look-up table from a word's (start, end) timestamp pair to the word itself,
# built from the first alternative of every result.
times = {
    (stamp[1], stamp[2]): stamp[0]
    for result in data['results']
    for stamp in result['alternatives'][0]['timestamps']
}

# Collect the words of each speaker in the order their labels appear.
transcripts = {}
for label in data['speaker_labels']:
    spoken = times[(label['from'], label['to'])]
    transcripts.setdefault(label['speaker'], []).append(spoken)

# One record per speaker, with that speaker's words joined by single spaces.
print([
    {'speaker': who, 'transcript': ' '.join(words)}
    for who, words in transcripts.items()
])

在所提供的示例上运行1000000次大约耗时12.34秒，希望这个速度足以满足您的需要。

网友

3楼 · 编辑于 2024-04-25 10:03:58

下面是我使用pandas解决这个问题的方法。

假设数据存储在名为data的字典中

import pandas as pd

# One row per speaker label, with columns taken from the label keys.
labels = pd.DataFrame.from_records(data['speaker_labels'])

# Flatten results -> alternatives -> timestamps into [word, from, to] rows
# so they can be loaded as records in a single step.
word_rows = [
    stamp
    for result in data['results']
    for alternative in result['alternatives']
    for stamp in alternative['timestamps']
]
transcript_tstamps = pd.DataFrame.from_records(
    word_rows,
    columns=['word', 'from', 'to'],
)

# Join speakers to words on the timestamp pair. The keys are spelled out so
# the join does not silently depend on which column names happen to overlap.
df = labels.merge(transcript_tstamps, on=['from', 'to'])
# produces a dataframe of speakers to words based on timestamps from & to
# since I knew I wanted to merge on the from & to columns, 
# I named the columns thus when I created the transcript_tstamps data frame
# like this:
    confidence  final   from  speaker     to     word
0        0.475  False   6.18        0   6.63      the
1        0.475  False   6.63        0   6.95  weather
2        0.475  False   6.95        0   7.53       is
3        0.499  False   7.73        0   8.11    sunny
4        0.472  False   8.21        0   8.50     it's
5        0.472  False   8.50        0   8.66     time
6        0.472  False   8.66        0   8.81       to
7        0.472  False   8.81        0   8.99      sip
8        0.472  False   8.99        0   9.02       in
9        0.472  False   9.02        0   9.25     some
10       0.472  False   9.25        0   9.32     cold
11       0.472  False   9.32        0   9.68     beer
12       0.441  False  10.52        2  10.88     sure
13       0.364  False  10.92        2  11.19     that
14       0.372  False  11.68        2  11.82   sounds
15       0.372  False  11.82        2  12.11     like
16       0.383  False  12.32        2  12.96        a
17       0.428  False  12.99        2  13.80     plan

在连接说话人和单词数据后，需要将同一说话人的连续单词组合在一起，以得出每一段话的说话人。例如，如果说话人数组是[2,2,2,2,0,0,0,2,2,2,0,0,0]，我们需要先把前四个2组合在一起，然后是接下来的三个0，再然后是三个2，最后是剩余的0。

按['from', 'to']对数据进行排序，然后为此设置一个称为current_speaker的伪变量，如下所示：

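df = df.sort_values(['from', 'to'])
df['current_speaker'] = (df['speaker'].shift() != df['speaker']).cumsum()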

从这里开始，按current_speaker分组，将单词聚合成一个句子&转换为json。有一点额外的重命名来修复输出json键

# Collapse each run of consecutive same-speaker words into one record.
# The aggregation is named by string: passing the builtin `min` is
# deprecated in recent pandas releases.
transcripts = df.groupby('current_speaker').agg({
    'word': lambda x: ' '.join(x),
    'speaker': 'min',
}).rename(columns={'word': 'transcript'})
transcripts[['speaker', 'transcript']].to_json(orient='records')
# produces the following output (indentation added by me for legibility):
'[{"speaker":0,
  "transcript":"the weather is sunny it\'s time to sip in some cold beer"},    
 {"speaker":2,
  "transcript":"sure that sounds like a plan"}]'

如果还想附带每段话的开始/结束时间，可以在groupby的聚合中加入from的最小值和to的最大值

# Same grouping as before, also keeping the time span of each run:
# earliest `from` and latest `to` among its words.
# Aggregations are named by string: passing the builtins `min`/`max` is
# deprecated in recent pandas releases.
transcripts = df.groupby('current_speaker').agg({
    'word': lambda x: ' '.join(x),
    'speaker': 'min',
    'from': 'min',
    'to': 'max',
}).rename(columns={'word': 'transcript'})

另外，（虽然这不适用于这个示例数据集），您可能应该为每个时间片选择置信度最高的替代方案。

语音转文本 - 将发言人标签映射到相应的JSON响应中的转录文本

相关问题更多 >

编程相关推荐

热门问题

热门文章

语音转文本 - 将发言人标签映射到相应的JSON响应中的转录文本

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >