<p>这里有一个蛮力 numpy 方法:它查找所有单词匹配,然后对所有组合进行评分,并对偏移量加以惩罚。</p>
<pre><code>import numpy as np
from scipy import signal
def pen(l, r):
    """Return the offset penalty between a previous end ``l`` and a next start ``r``.

    When ``r >= l`` the penalty is the gap ``r - l``.  When ``l > r`` (the next
    start lies before the previous end) the penalty is ``3 * (l - r)``.

    The arithmetic form is kept (rather than an ``if``) so the expression still
    works element-wise if array arguments are passed.
    """
    gap = r - l
    # (l > r) is 0 or 1, so the factor is +1 for forward gaps and -3 otherwise.
    factor = 1 - 4 * (l > r)
    return gap * factor
class template:
    """Locate tokens inside a fixed reference string using convolutions.

    The reference text is stored as an array of Unicode code points
    (``self.template``) together with its element-wise squares
    (``self.normalise``).
    """

    def __init__(self, template):
        """Encode the reference string ``template`` and precompute its squares."""
        self.template = self._codepoints(template)
        self.normalise = self.template * self.template

    @staticmethod
    def _codepoints(text):
        """Return the code points of ``text`` as an int64 array."""
        # 'utf32' writes a 4-byte byte-order mark first; offset=4 skips it.
        raw = np.frombuffer(text.encode('utf32'), offset=4, dtype=np.int32)
        # Widen to 64 bits: products and sums of code points (up to 0x10FFFF)
        # wrap around in int32, which can make a mismatch look like a match.
        return raw.astype(np.int64)

    def match(self, other):
        """Find the window(s) of the template that best fit the string ``other``.

        For every window of ``len(other)`` characters two sums are computed:
        ``m`` (sum of template * token values) and ``t`` (sum of squared
        template values).  The score is ``|m - t|``; it is 0 wherever the
        token occurs verbatim, and the smallest score is treated as the best.

        Returns:
            ``(starts, score)``: an array with every start index that reaches
            the minimal score, and that minimal score.
        """
        # NOTE(review): a token longer than the template is not rejected
        # here -- confirm how 'valid' mode behaves in that case.
        # Reversing the token turns the convolution into a correlation.
        other = self._codepoints(other)[::-1]
        m = signal.convolve(self.template, other, 'valid')
        t = signal.convolve(self.normalise, np.ones_like(other), 'valid')
        delta = np.absolute(m - t)
        md = delta.min()
        return np.where(delta == md)[0], md

    def brute(self, tok):
        """Score every combination of best matches for the tokens in ``tok``.

        Each candidate is a list ``[score, (token, start, end), ...]``.  The
        score starts at minus the first token's match score; every further
        token subtracts its own match score and ``pen(previous_end, start)``.
        The number of candidates is the product of the per-token match counts.

        Returns:
            All candidates sorted by ascending score, so the highest-scoring
            one is the last element.  An empty ``tok`` yields ``[]``.
        """
        # len() rather than truthiness so sequence types such as arrays work.
        if len(tok) == 0:
            return []
        first = tok[0]
        ms, md = self.match(first)
        matches = [[-md, (first, s, s + len(first))] for s in ms]
        for t in tok[1:]:
            ms, md = self.match(t)
            matches = [
                [mo[0] - md - pen(mo[-1][-1], mn)]
                + mo[1:]
                + [(t, mn, mn + len(t))]
                for mn in ms
                for mo in matches
            ]
        return sorted(matches, key=lambda x: x[0])
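# Alternative loop for brute(): no pen() term; combinations in which a token
# starts before the previous token ends are discarded instead.
# for t in tok[1:]:
#     ms, md = self.match(t)
#     matches = [[mo[0] - md] + mo[1:]
#                + [(t, mn, mn + len(t))] for mn in ms for mo in matches
#                if mo[-1][-1] <= mn]
# return sorted(matches, key=lambda x: x[0])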
# Demo: align a tokenised sentence against the text it came from.
text = (
    'Kir4.3 is a inwardly-rectifying potassium channel. '
    'Dextran-sulfate is useful in glucose-mediated channels.'
)
tok = [
    'Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel', '.',
    'Dextran-sulfate', 'is', 'useful', 'in', 'glucose', '-', 'mediated',
    'channels', '.',
]
tx = template(text)
matches = tx.brute(tok)
# brute() sorts by ascending score, so the last entry is the top-scoring one.
print(matches[-1])
# Output:
# [-11, ('Kir4.3', 0, 6), ('is', 7, 9), ('a', 10, 11), ('inwardly-rectifying', 12, 31), ('potassium', 32, 41), ('channel', 42, 49), ('.', 49, 50), ('Dextran-sulfate', 51, 66), ('is', 67, 69), ('useful', 70, 76), ('in', 77, 79), ('glucose', 80, 87), ('-', 87, 88), ('mediated', 88, 96), ('channels', 97, 105), ('.', 105, 106)]
</code></pre>