字符串列表到字符串的对齐索引

2024-03-28 14:10:57 发布

您现在位置:Python中文网/ 问答频道 /正文

我需要一个函数,用来给出字符串列表中的各个字符串在一个更大的字符串中最匹配位置的索引。

例如:

给定字符串:

text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'

以及字符串列表:

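tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']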

是否可以创建函数来生成:

indices = [7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]


下面是我创建的一个脚本来说明这一点:

from re import split
from numpy import vstack, zeros
import numpy as np

# I need a function which takes a string and the tokenized list 
# and returns the indices for which the tokens were split at
def index_of_split(text_str, list_of_strings):
    """Return the offsets in ``text_str`` where each token after the first starts.

    The tokens are located one after another, scanning left to right: every
    token is searched for starting at the end of the previously matched
    token, so repeated tokens (e.g. two occurrences of ``'is'``) resolve to
    successive occurrences instead of the same one.

    Args:
        text_str: The full text the tokens were taken from.
        list_of_strings: The tokens, in the order they occur in ``text_str``.

    Returns:
        A list with ``len(list_of_strings) - 1`` integers (empty for zero or
        one token): the start offset of every token except the first. The
        list can be passed directly to ``numpy.split`` as split points.

    Raises:
        ValueError: If a token cannot be found at or after the end of the
            previous token.
    """
    indices = []
    cursor = 0  # offset just past the previously matched token
    for position, token in enumerate(list_of_strings):
        start = text_str.find(token, cursor)
        if start == -1:
            raise ValueError(
                "token %r (index %d) not found in text at or after offset %d"
                % (token, position, cursor)
            )
        # The first token only anchors the scan; it is not a split point.
        if position:
            indices.append(start)
        cursor = start + len(token)
    return indices

# Inputs: the full text, its tokens in order of appearance, and one binary
# label per character of the text.
text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'
tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']
# (This binary array labels the following terms ['Kir4.3', 'Dextran-sulfate', 'glucose'])
bin_ann = [1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

# Compute the split points: the offsets at which the tokens start.
indices = index_of_split(text, tok)
# Desired value for the inputs above:
#indices = [7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]

# Cut the per-character labels at those offsets, giving one chunk per token.
bin_ann_toked = np.split(bin_ann, indices)
# Stack tokens and label chunks, then transpose so each row is
# (token, label chunk).
# NOTE(review): the chunks have different lengths, so this relies on NumPy
# building an object array from ragged input -- confirm that the NumPy
# version in use still allows that.
tokenized_strings = np.vstack((tok, bin_ann_toked)).T

# Each chunk also covers the characters between this token and the next one
# (spaces or other untokenized text); trim it to the token's own length.
for i, el in enumerate(tokenized_strings):
    tokenized_strings[i][1] = el[1][:len(el[0])]
print(tokenized_strings)

如果函数按所述工作,则将给出以下输出:

[['Kir4.3' array([1, 1, 1, 1, 1, 1])]
 ['is' array([0, 0])]
 ['a' array([0])]
 ['inwardly-rectifying'
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
 ['potassium' array([0, 0, 0, 0, 0, 0, 0, 0, 0])]
 ['channel' array([0, 0, 0, 0, 0, 0, 0])]
 ['.' array([0])]
 ['Dextran-sulfate' array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]
 ['is' array([0, 0])]
 ['useful' array([0, 0, 0, 0, 0, 0])]
 ['in' array([0, 0])]
 ['glucose' array([1, 1, 1, 1, 1, 1, 1])]
 ['-' array([0])]
 ['mediated' array([0, 0, 0, 0, 0, 0, 0, 0])]
 ['channels' array([0, 0, 0, 0, 0, 0, 0, 0])]
 ['.' array([0])]]

Tags: the, 字符串, text, in, is, array, list, split
2条回答

这里有一个蛮力 numpy 方法:它查找所有单词匹配,然后对所有组合进行评分,以惩罚偏移量。

import numpy as np
from scipy import signal

def pen(l, r):
    """Return the penalty for jumping from offset ``l`` to offset ``r``.

    A forward jump (``l <= r``) costs its length, ``r - l``.  A backward
    jump (``l > r``) costs three times the distance stepped back.
    """
    went_backwards = l > r
    weight = 1 - 4 * went_backwards  # 1 when moving forward, -3 when moving back
    return (r - l) * weight

class template:
    """Locate tokens inside a fixed text by correlating Unicode code points.

    The text is stored as an array of code points.  ``match`` slides a token
    over the text and scores every window; ``brute`` combines the best
    windows of a whole token sequence and ranks the combinations.

    Attributes:
        template: 1-D int64 array with the code points of the text.
        normalise: Element-wise square of ``template``.
    """

    def __init__(self, template):
        """Store the code points of the text ``template`` and their squares."""
        self.template = self._codepoints(template)
        self.normalise = self.template * self.template

    @staticmethod
    def _codepoints(s):
        """Return the Unicode code points of ``s`` as a 1-D int64 array."""
        # The 'utf32' codec writes a 4-byte byte-order mark first; offset=4
        # skips it.  Widening to int64 keeps the squares and the convolution
        # sums from wrapping around, which int32 does for code points above
        # 46340 or for long tokens.
        raw = np.frombuffer(s.encode('utf32'), offset=4, dtype=np.int32)
        return raw.astype(np.int64)

    def match(self, other):
        """Find the windows of the text that score best against ``other``.

        For a window ``w`` and token ``o`` of equal length the score is
        ``abs(sum(w * o) - sum(w * w))``.  It is 0 for an exact match; note
        that a score of 0 does not by itself prove the window equals the
        token.

        Args:
            other: The token to look for.

        Returns:
            A tuple ``(starts, score)``: an integer array with the start
            offset of every window that attains the minimal score, and that
            minimal score.
        """
        # Convolving with the reversed token is a sliding dot product.
        other = self._codepoints(other)[::-1]
        m = signal.convolve(self.template, other, 'valid')
        t = signal.convolve(self.normalise, np.ones_like(other), 'valid')
        delta = np.absolute(m - t)
        md = delta.min()
        return np.where(delta == md)[0], md

    def brute(self, tok):
        """Rank every combination of best-match positions for the tokens.

        Each token contributes all positions returned by ``match``; the
        candidates are the Cartesian product of those positions, so the
        result grows multiplicatively with the number of ambiguous tokens.
        A candidate's score starts at 0 and, per token, loses the token's
        match score and (from the second token on) ``pen(previous_end,
        start)``, the module-level offset penalty.

        Args:
            tok: Sequence of tokens, in the order they are expected to occur.

        Returns:
            A list of candidates sorted by ascending score, so the best
            alignment is the last element.  Each candidate is a list
            ``[score, (token, start, end), (token, start, end), ...]``.
            An empty ``tok`` yields an empty list.
        """
        if not tok:
            return []
        first = tok[0]
        ms, md = self.match(first)
        matches = [[-md, (first, s, s + len(first))] for s in ms]
        for t in tok[1:]:
            ms, md = self.match(t)
            # mo[-1][-1] is the end offset of the previously placed token.
            matches = [[mo[0] - md - pen(mo[-1][-1], mn)] + mo[1:]
                       + [(t, mn, mn + len(t))] for mn in ms for mo in matches]
        return sorted(matches, key=lambda x: x[0])

# Demo: align the token list against the sentence and print the best-ranked
# alignment (brute() sorts ascending, so the best candidate comes last).
text = (
    'Kir4.3 is a inwardly-rectifying potassium channel. '
    'Dextran-sulfate is useful in glucose-mediated channels.'
)
tok = [
    'Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel', '.',
    'Dextran-sulfate', 'is', 'useful', 'in', 'glucose', '-', 'mediated',
    'channels', '.',
]
tx = template(text)
matches = tx.brute(tok)
print(matches[-1])

# Output reported for the inputs above:
# [-11, ('Kir4.3', 0, 6), ('is', 7, 9), ('a', 10, 11), ('inwardly-rectifying', 12, 31), ('potassium', 32, 41), ('channel', 42, 49), ('.', 49, 50), ('Dextran-sulfate', 51, 66), ('is', 67, 69), ('useful', 70, 76), ('in', 77, 79), ('glucose', 80, 87), ('-', 87, 88), ('mediated', 88, 96), ('channels', 97, 105), ('.', 105, 106)]
# Locate each token with str.find, scanning the text left to right.
text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'

tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']


# ind[0] is a seed value; ind[k + 1] is the start offset of tok[k]
# (-1 if the token was not found).
ind = [0]
cursor = 0  # offset just past the last token that was found
for substring in tok:
    start = text.find(substring, cursor)
    ind.append(start)
    if start != -1:
        # Resume after this token, so a repeated token cannot match the
        # same occurrence again; a miss leaves the cursor where it was.
        cursor = start + len(substring)

# Drop the seed and the first token's offset: what is left are the split points.
print(ind[2:])

结果

[7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]

相关问题 更多 >