# Summarise a POS-tagged corpus written as alternating "TAG word" tokens.
#
# Three summaries are built and printed:
#   pos_list  - per tag:  how often the tag occurs and which words carry it
#   word_list - per word: how often the word occurs and which tags it carries
#   paired    - per word: {tag: corpus-wide count of that tag} for each of
#               the word's tags
from collections import Counter, defaultdict
from string import punctuation

# Switches for *word* tokens only.  Tags are labels, not text: they are
# upper-cased for consistent keys but never casefolded or stripped, so a tag
# such as "PRP$" is not silently merged with "PRP".
RETAIN_PUNCTUATION_FLAG = False
RETAIN_CASE_FLAG = False

string = "DT The NN dog VB jumps DT the NN sofa. DT The NN cat VB pages DT the NN page."

punctuation_strip_table = str.maketrans('', '', punctuation)


def normalize_tokens(text, *, retain_case=False, retain_punctuation=False):
    """Split *text* into interleaved ``[TAG, word, TAG, word, ...]`` tokens.

    Tokens are separated by any run of whitespace.  Tokens at even positions
    are treated as tags and upper-cased; tokens at odd positions are treated
    as words and are casefolded unless *retain_case* is true, and have ASCII
    punctuation removed unless *retain_punctuation* is true.
    """
    normalized = []
    # split() without an argument collapses repeated spaces/newlines, so
    # stray whitespace cannot shift the tag/word alignment.
    for index, token in enumerate(text.split()):
        if index % 2 == 0:
            normalized.append(token.upper())
            continue
        if not retain_case:
            token = token.casefold()
        if not retain_punctuation:
            token = token.translate(punctuation_strip_table)
        normalized.append(token)
    return normalized


def build_summaries(tokens):
    """Return ``(pos_list, word_list, paired)`` for interleaved *tokens*.

    *tokens* is a list shaped like the result of ``normalize_tokens``: tags
    at even indices, words at odd indices.  Tag counts are taken from tag
    positions only and word counts from word positions only, so a word that
    happens to be spelled like a tag does not inflate either count.  The
    ``words`` / ``pos`` lists are sorted to make the output reproducible.
    """
    tags = tokens[0::2]
    words = tokens[1::2]
    tag_counts = Counter(tags)
    word_counts = Counter(words)

    words_by_tag = defaultdict(set)
    tags_by_word = defaultdict(set)
    for tag, word in zip(tags, words):
        words_by_tag[tag].add(word)
        tags_by_word[word].add(tag)

    pos_summary = {
        tag: {'count': count, 'words': sorted(words_by_tag[tag])}
        for tag, count in tag_counts.items()
    }
    word_summary = {
        word: {'count': count, 'pos': sorted(tags_by_word[word])}
        for word, count in word_counts.items()
    }
    # Every tag a word carries is kept; the value is the tag's full count in
    # the corpus, not the count of that particular (tag, word) pairing.
    pair_summary = {
        word: {tag: tag_counts[tag] for tag in sorted(tags_by_word[word])}
        for word in word_counts
    }
    return pos_summary, word_summary, pair_summary


list_all = normalize_tokens(
    string,
    retain_case=RETAIN_CASE_FLAG,
    retain_punctuation=RETAIN_PUNCTUATION_FLAG,
)
pos_word_pairs = set(zip(list_all[0::2], list_all[1::2]))
pos_list, word_list, paired = build_summaries(list_all)

print('pos_list:', pos_list)
print()
print('word_list:', word_list)
print()
print('paired:', paired)
这种方法应该能为您提供所要查找的结构,其中 POS 计数是该标记在所给语料库中的完整计数。

注意:使用 `RETAIN_PUNCTUATION_FLAG` 和 `RETAIN_CASE_FLAG` 可以切换行为:在求值前去掉标点符号、将大小写统一,或保留原有的大小写,或者两者都做。在这里,它们都被赋值为 `False`,因此所有单词都将被处理为小写,所有 ASCII 标点符号都将在求值之前被去除。

我添加了 `word_list` 和 `pos_list` 作为备选列表。

输出:
相关问题 更多 >
编程相关推荐