如何将 Python/Cython 的 unicode 字符串转换为长整数数组，以便计算 Levenshtein 编辑距离（edit distance）？

#---------------------------------------------------------------------------
# Memory management routines.  ``size_t`` is a built-in Cython type, so it is
# not re-declared here (the former ``unsigned int`` typedef was wrong on
# platforms where ``size_t`` is 64 bits wide).
cdef extern from "stdlib.h":
    void *malloc(size_t size)
    void *calloc(size_t n, size_t size)
    void free(void *ptr)

#---------------------------------------------------------------------------
# String routines live in <string.h>, not <stdlib.h>.
cdef extern from "string.h":
    size_t strlen(char *s)
    int strcmp(char *a, char *b)
    char *strcpy(char *a, char *b)

#---------------------------------------------------------------------------
cdef extern from "Python.h":
    object PyTuple_GET_ITEM(object, int)
    void Py_INCREF(object)

#---------------------------------------------------------------------------
cdef inline size_t imin(int a, int b, int c):
    # Return the smallest of the three integers ``a``, ``b`` and ``c``.
    if a < b:
        if c < a:
            return c
        return a
    if c < b:
        return c
    return b

#---------------------------------------------------------------------------
cpdef int editdistance(char *a, char *b) except -1:
    """Given two byte strings ``a`` and ``b``, return their absolute Damerau-
    Levenshtein distance (restricted / "optimal string alignment" variant).

    Each deletion, insertion, substitution, and transposition of two adjacent
    bytes is counted as one difference, so the edit distance between ``abc``
    and ``ab``, ``abcx``, ``abx``, ``acb``, respectively, is ``1``.

    The comparison is byte-wise: multi-byte encoded characters are compared
    byte by byte, not code point by code point.

    Returns the non-negative distance.  Raises ``MemoryError`` if the work
    rows cannot be allocated (``-1`` is the C-level error sentinel).
    """
    cdef int alen = <int>strlen(a)
    cdef int blen = <int>strlen(b)
    cdef int i
    cdef int j
    cdef int best
    cdef int via_swap
    cdef int result
    cdef char achr
    cdef char bchr
    cdef char *ctmp
    cdef int *prev2 = NULL    # row i - 2 of the DP matrix
    cdef int *prev = NULL     # row i - 1 of the DP matrix
    cdef int *cur = NULL      # row i, currently being filled in
    cdef int *rtmp
    #.........................................................................
    # Identical strings need no work rows at all.
    if strcmp(a, b) == 0:
        return 0
    #.........................................................................
    # Make ``b`` the longer string so ``a`` drives the outer loop.
    if alen > blen:
        ctmp = a
        a = b
        b = ctmp
        alen, blen = blen, alen
    #.........................................................................
    # Rows hold ``int`` cells: ``char`` cells overflow once a distance
    # exceeds 127, and a zero cell must not be mistaken for a terminator.
    prev2 = <int *>calloc(blen + 1, sizeof(int))
    prev = <int *>calloc(blen + 1, sizeof(int))
    cur = <int *>calloc(blen + 1, sizeof(int))
    if prev2 == NULL or prev == NULL or cur == NULL:
        # free(NULL) is a no-op, so release whatever was obtained.
        free(prev2)
        free(prev)
        free(cur)
        raise MemoryError()
    #.........................................................................
    # Row 0: turning the empty prefix of ``a`` into b[:j] takes j insertions.
    for j in range(blen + 1):
        prev[j] = j
    #.........................................................................
    for i in range(1, alen + 1):
        # Column 0: turning a[:i] into the empty string takes i deletions.
        cur[0] = i
        achr = a[i - 1]
        for j in range(1, blen + 1):
            bchr = b[j - 1]
            if achr == bchr:
                best = prev[j - 1]
            else:
                best = 1 + <int>imin(cur[j - 1], prev[j - 1], prev[j])
            # Adjacent transposition: extend the solution from two rows and
            # two columns back, but only when that is actually cheaper.
            if i > 1 and j > 1 and achr == b[j - 2] and bchr == a[i - 2]:
                via_swap = prev2[j - 2] + 1
                if via_swap < best:
                    best = via_swap
            cur[j] = best
        #.......................................................................
        # Rotate the rows: the oldest buffer is recycled as the next ``cur``.
        rtmp = prev2
        prev2 = prev
        prev = cur
        cur = rtmp
    #.........................................................................
    # After the final rotation the last completed row is ``prev``.
    result = prev[blen]
    #.........................................................................
    # cleanup:
    free(prev2)
    free(prev)
    free(cur)
    #.........................................................................
    return result

3条回答

网友

1楼 · 编辑于 2024-04-25 01:58:49

读者请注意（caveat lector）：我从来没有这样做过。下面只是我会尝试的做法的一个粗略草图。

您需要使用 PyUnicode_AsUnicode 函数以及文档中紧随其后的 PyUnicode_GetSize 函数。在声明中目前使用 char 的地方，请改用 Py_UNICODE。如果使用的是窄（UCS2）构建，您需要复制内部缓冲区，并在复制过程中转换代理对（surrogate pairs）。对于宽（UCS4）构建，您可以直接对内部缓冲区进行操作。

网友

2楼 · 编辑于 2024-04-25 01:58:49

我关闭这个问题是因为我找到了一个更好的算法……不过它也有自己的问题。我们在那边的新问题里见。

网友

3楼 · 编辑于 2024-04-25 01:58:49

使用 ord() 将字符转换为其整数代码点。它对 unicode 和 str 两种字符串类型中的字符都适用：

codepoints = [ord(c) for c in text]

相关问题更多 >

编程相关推荐

热门问题

热门文章