我得到了微软语音识别实验室的虚拟代码。 我试图找出单词错误率(个别以及总和)的所有句子存储在文件中。你知道吗
我已经使用Numpy数组将文件加载到内存中,现在我正在努力找到文件中每个句子的错误率。总共有三个句子,我希望我的程序遍历每个句子,并计算单词错误率。我的循环运行了三次,但结果只在第一句话中累积。看看我的代码,告诉我哪里出错了。谢谢。你知道吗
提供的代码:
def string_edit_distance(ref="ref_data", hyp="hyp_data"):
if ref is None or hyp is None:
RuntimeError("ref and hyp are required, cannot be None")
x = ref
y = hyp
tokens = len(x)
if (len(hyp)==0):
return (tokens, tokens, tokens, 0, 0)
# p[ix,iy] consumed ix tokens from x, iy tokens from y
p = np.PINF * np.ones((len(x) + 1, len(y) + 1)) # track total errors
e = np.zeros((len(x)+1, len(y) + 1, 3), dtype=np.int) # track deletions, insertions, substitutions
p[0] = 0
for ix in range(len(x) + 1):
for iy in range(len(y) + 1):
cst = np.PINF*np.ones([3])
s = 0
if ix > 0:
cst[0] = p[ix - 1, iy] + 1 # deletion cost
if iy > 0:
cst[1] = p[ix, iy - 1] + 1 # insertion cost
if ix > 0 and iy > 0:
s = (1 if x[ix - 1] != y[iy -1] else 0)
cst[2] = p[ix - 1, iy - 1] + s # substitution cost
if ix > 0 or iy > 0:
idx = np.argmin(cst) # if tied, one that occurs first wins
p[ix, iy] = cst[idx]
if (idx==0): # deletion
e[ix, iy, :] = e[ix - 1, iy, :]
e[ix, iy, 0] += 1
elif (idx==1): # insertion
e[ix, iy, :] = e[ix, iy - 1, :]
e[ix, iy, 1] += 1
elif (idx==2): # substitution
e[ix, iy, :] = e[ix - 1, iy - 1, :]
e[ix, iy, 2] += s
edits = int(p[-1,-1])
deletions, insertions, substitutions = e[-1, -1, :]
我迄今为止的尝试:
with open("misc/hyp.trn") as f:
hyp_data = f.readlines()
with open("misc/ref.trn") as f:
ref_data = f.readlines()
hypData = []
refData = []
for lines in hyp_data:
hypData.append(lines[:][:-20])
for line in ref_data:
refData.append(line[:][:-20])
for i in range(len(hypData)):
print("Line Number: ",i, refData[i], hypData[i])
print("Total number of reference sentences in the test set: ", len(refData))
print("Number of sentences with an error", len(hypData))
print("Total number of reference words", tokens)
print("Total number of word substitutions, insertions, and deletions: ")
print("----------------------------------------------------------------")
print("Scores: N="+str(tokens)+", S="+str(substitutions)+", D= "+str(deletions)+",
I="+str(insertions))
print("The percentage of total errors (WER) and percentage of substitutions, insertions, and
deletions")
wer = (deletions+insertions+substitutions)/tokens
print("The percentage of total errors (WER): ", int((wer*100)*10 + 0.5)/10)
print("Percentage of substitutions: ", int((substitutions*100 + 0.5)/10))
print("Percentage of insertions: ", int((insertions*100 + 0.5)/10))
print("Percentage of deletions: ",int((deletions*100 + 0.5)/10))
string_edit_distance()
目前没有回答
相关问题 更多 >
编程相关推荐