我需要使用的不是一个库,而是一个硬编码字符串内核的文本分类,但我有一些问题,请启发我。 我从(https://github.com/helq/python-ssk)借用了SSK内核的pythonic变体(来自commit)
import numpy as np
# Kernel defined by Lodhi et al. (2002)
def ssk(s, t, n, lbda, accum=False):
lens, lent = int(len(s)), int(len(t))
#n = int(n)
#k_prim = (-1)*np.ones( (n+1, lens, lent) )
k_prim = np.zeros( (n, lens, lent) )
indices = { x : [i for i, e in enumerate(t) if e == x] for x in set(s) }
k_prim[0,:,:] = 1
for i in range(1,n):
for sj in range(i,lens):
toret = 0.
for tk in range(i,lent):
if s[sj-1]==t[tk-1]: # trick taken from shogun implemantion of SSK
toret = lbda * (toret + lbda*k_prim[i-1,sj-1,tk-1])
else:
toret *= lbda
k_prim[i,sj,tk] = toret + lbda * k_prim[i, sj-1, tk]
start = 0 if accum else n-1
k = 0.
for i in range(n):
for sj in range(i,lens):
for tk in range(i,lent):
if s[sj]==t[tk]:
k += lbda*lbda*k_prim[i,sj,tk]
# print( [len(list(i for (sj,tk,i) in k_prim if i==m-1)) for m in range(n)] )
return k
def string_kernel(xs, ys, n, lbda):
if len(xs.shape) != 2 or len(ys.shape) != 2 or xs.shape[1] != 1 or ys.shape[1] != 1:
raise "The shape of the features is wrong, it must be (n,1)"
lenxs, lenys = xs.shape[0], ys.shape[0]
mat = np.zeros( (lenxs, lenys) )
for i in range(lenxs):
for j in range(lenys):
mat[i,j] = ssk(xs[i,0], ys[j,0], n, lbda, accum=True)
mat_xs = np.zeros( (lenxs, 1) )
mat_ys = np.zeros( (lenys, 1) )
for i in range(lenxs):
mat_xs[i] = ssk(xs[i,0], xs[i,0], n, lbda, accum=True)
for j in range(lenys):
mat_ys[j] = ssk(ys[j,0], ys[j,0], n, lbda, accum=True)
return np.divide(mat, np.sqrt(mat_ys.T * mat_xs))
我试过他提议的包装纸:
def get_ssk_kernel_for_scikit(max_substring, lambda_decay):
def strker(il,ir):
#print("Shape of gramm matrix to create ({},{})".format(len(il), len(ir)))
# assuming that il and ir are lists of strings.
# len(il) may fail to give you the size real size if you're using np.arrays
# the idea is to reshape your data to be np.arrays of shapes (n,1) and (m,1)
l = np.array(il).reshape( (len(il), 1) )
r = np.array(ir).reshape( (len(ir), 1) )
return string_kernel(l, r, max_substring, lambda_decay)
return strker
lambda_decay = 1
max_substring = 2
my_ssk_kernel = get_ssk_kernel_for_scikit(max_substring, lambda_decay)
然后,我使用scikit执行SVM:
xs = ["man", "woman", "women", "men"]
ys = [0, 1, 1, 0]
clf = svm.SVC(kernel = my_ssk_kernel)
clf.fit(xs, ys)
我得到一个错误:
ValueError: could not convert string to float: 'man'
我已经找了好几个小时了,但没有找到任何解决办法。关于这个问题(How to use string kernels in scikit-learn?),给出的例子对我不起作用。如果你能帮助我,我将非常感激
目前没有回答
相关问题 更多 >
编程相关推荐