在Python中将字符串内核包装到scikit SVM分类器中

2024-06-10 19:24:35 发布

您现在位置:Python中文网/ 问答频道 /正文

我需要在文本分类中使用一个硬编码的字符串核(string kernel),而不是依赖现成的库,但遇到了一些问题,希望有人能指点一下。我从 https://github.com/helq/python-ssk 借用了 SSK 核的纯 Python 实现(来自某次 commit):

import numpy as np

# Kernel defined by Lodhi et al. (2002)
def ssk(s, t, n, lbda, accum=False):
    """Compute the (unnormalised) string subsequence kernel of two sequences.

    Args:
        s: First sequence (a string, or any indexable sequence whose items
            can be compared with ``==``).
        t: Second sequence.
        n: Maximum subsequence length. Values below 1 are not supported
            (the dynamic-programming table would have no first layer).
        lbda: Decay factor; every position spanned by a matching
            subsequence contributes one factor of ``lbda``.
        accum: If true, return the sum of the kernels for subsequence
            lengths 1..n. If false, return only the kernel for length n.

    Returns:
        The kernel value as a float (0.0 when no subsequence matches).
    """
    lens, lent = len(s), len(t)

    # k_prim[i] is the auxiliary table from which the kernel for
    # subsequences of length i + 1 is derived; layer 0 is all ones.
    k_prim = np.zeros((n, lens, lent))
    k_prim[0, :, :] = 1

    for i in range(1, n):
        for sj in range(i, lens):
            toret = 0.
            for tk in range(i, lent):
                # Running-sum trick taken from the shogun implementation of
                # SSK: `toret` carries the decayed partial sum along t.
                if s[sj - 1] == t[tk - 1]:
                    toret = lbda * (toret + lbda * k_prim[i - 1, sj - 1, tk - 1])
                else:
                    toret *= lbda
                k_prim[i, sj, tk] = toret + lbda * k_prim[i, sj - 1, tk]

    # Without accumulation only the last layer (length-n subsequences)
    # contributes; with accumulation every layer does.
    start = 0 if accum else n - 1
    k = 0.
    for i in range(start, n):
        for sj in range(i, lens):
            for tk in range(i, lent):
                if s[sj] == t[tk]:
                    k += lbda * lbda * k_prim[i, sj, tk]

    return k

def string_kernel(xs, ys, n, lbda):
    """Build the normalised SSK Gram matrix between two columns of strings.

    Each entry is ``ssk(x, y) / sqrt(ssk(x, x) * ssk(y, y))`` using the
    accumulated kernel (``accum=True``).

    Args:
        xs: Array of shape (len_x, 1) whose single column holds sequences.
        ys: Array of shape (len_y, 1) whose single column holds sequences.
        n: Maximum subsequence length, forwarded to ``ssk``.
        lbda: Decay factor, forwarded to ``ssk``.

    Returns:
        A float array of shape (len_x, len_y).

    Raises:
        ValueError: If either input is not 2-D with exactly one column.
    """
    if len(xs.shape) != 2 or len(ys.shape) != 2 or xs.shape[1] != 1 or ys.shape[1] != 1:
        # A bare string cannot be raised in Python 3; use a real exception
        # so the caller sees this message instead of an unrelated TypeError.
        raise ValueError("The shape of the features is wrong, it must be (n,1)")

    lenxs, lenys = xs.shape[0], ys.shape[0]

    # Raw cross-kernel values.
    mat = np.zeros((lenxs, lenys))
    for i in range(lenxs):
        for j in range(lenys):
            mat[i, j] = ssk(xs[i, 0], ys[j, 0], n, lbda, accum=True)

    # Self-kernel of every sequence, used for normalisation.
    mat_xs = np.zeros((lenxs, 1))
    mat_ys = np.zeros((lenys, 1))

    for i in range(lenxs):
        mat_xs[i] = ssk(xs[i, 0], xs[i, 0], n, lbda, accum=True)
    for j in range(lenys):
        mat_ys[j] = ssk(ys[j, 0], ys[j, 0], n, lbda, accum=True)

    # (1, len_y) * (len_x, 1) broadcasts to the (len_x, len_y) grid of
    # self-kernel products.
    return np.divide(mat, np.sqrt(mat_ys.T * mat_xs))

我试过他提议的包装器(wrapper):

def get_ssk_kernel_for_scikit(max_substring, lambda_decay):
    """Return a two-argument kernel callable bound to the given SSK settings.

    Args:
        max_substring: Maximum subsequence length handed to ``string_kernel``.
        lambda_decay: Decay factor handed to ``string_kernel``.

    Returns:
        A function ``strker(il, ir)`` that evaluates ``string_kernel`` on the
        two sample collections.
    """
    def strker(il, ir):
        # Both arguments are assumed to be lists of strings; len() may not
        # report the sample count you expect for multi-dimensional np.arrays.
        # string_kernel wants column arrays of shapes (n, 1) and (m, 1).
        left_column = np.reshape(np.array(il), (len(il), 1))
        right_column = np.reshape(np.array(ir), (len(ir), 1))
        return string_kernel(
            left_column, right_column, max_substring, lambda_decay
        )

    return strker

# Decay factor forwarded to the kernel as `lbda`; with 1 every multiplication
# by the decay factor leaves the value unchanged.
lambda_decay = 1
# Forwarded to the kernel as `n`, the maximum subsequence length.
max_substring = 2

# Kernel callable configured with the two settings above.
my_ssk_kernel = get_ssk_kernel_for_scikit(max_substring, lambda_decay)

然后,我使用scikit执行SVM:

# Toy training set: four strings with binary labels.
xs = ["man", "woman", "women", "men"] 
ys = [0, 1, 1, 0] 

# NOTE(review): `svm` is not imported in the code shown here — presumably
# `from sklearn import svm` was run earlier; confirm.
clf = svm.SVC(kernel = my_ssk_kernel)
# NOTE(review): X is a list of raw strings here; the post reports that this
# call fails with "ValueError: could not convert string to float: 'man'".
# Fitting on a precomputed Gram matrix (kernel="precomputed") is the usual
# way around this — confirm against the scikit-learn SVC documentation.
clf.fit(xs, ys)

我得到一个错误:

ValueError: could not convert string to float: 'man'

我已经找了好几个小时,但没有找到任何解决办法。相关问题(How to use string kernels in scikit-learn?)中给出的例子对我也不起作用。如果你能帮助我,我将非常感激。


Tags: in, for, len, if, np, range, kernel, tk