我有以下问题:
在英语中,我的代码使用Gensim生成成功的单词嵌入,考虑到余弦距离,相似短语彼此接近:
“反应时间与误差测量”和“用户感知反应时间与误差测量的关系”之间的夹角很小,因此它们是集合中最相似的短语。
然而,当我在葡萄牙语中使用相同的短语时,它并不起作用:
我的代码如下:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import matplotlib.pyplot as plt
from gensim import corpora
# Corpus: nine short Portuguese phrases (a translated version of the classic
# Deerwester/LSA toy corpus used in the gensim tutorials).
documents = [
    "Interface máquina humana para aplicações computacionais de laboratório abc",
    "Um levantamento da opinião do usuário sobre o tempo de resposta do sistema informático",
    "O sistema de gerenciamento de interface do usuário EPS",
    "Sistema e testes de engenharia de sistemas humanos de EPS",
    "Relação do tempo de resposta percebido pelo usuário para a medição de erro",
    "A geração de árvores não ordenadas binárias aleatórias",
    "O gráfico de interseção dos caminhos nas árvores",
    "Gráfico de menores IV Largura de árvores e bem quase encomendado",
    "Gráficos menores Uma pesquisa",
]

# BUG FIX: the original stoplist held *English* stopwords
# ("for a of the and to in on"), which never match Portuguese text, so
# stopword removal was a silent no-op.  Use Portuguese function words.
stoplist = set("a o e de do da dos das um uma para em na no sobre pelo".split())

# Lowercase, split on whitespace, and drop stopwords.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

from collections import defaultdict

# Token frequency over the whole (filtered) corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
from nltk import tokenize

# BUG FIX: the original tokenized the *raw* (mixed-case) documents, so the
# dictionary contained tokens such as "Sistema" while the queries below are
# lowercased before look-up — those tokens could never match, which breaks
# the similarity scores.  Lowercase here so dictionary entries and query
# tokens agree.
texts = [
    tokenize.word_tokenize(document.lower(), language='portuguese')
    for document in documents
]

from pprint import pprint
pprint(texts)

# Map each distinct token to an integer id and persist for later reuse.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary)
print(dictionary.token2id)
# VECTOR: bag-of-words representation of a new query phrase.
new_doc = "Tempo de resposta e medição de erro"
# BUG FIX: the query must be tokenized the same way as the corpus (nltk's
# Portuguese tokenizer); plain .split() can produce tokens the dictionary
# has never seen, silently dropping query terms from the vector.
new_vec = dictionary.doc2bow(
    tokenize.word_tokenize(new_doc.lower(), language='portuguese'))
print(new_vec)

## VECTOR OF PHRASES: bag-of-words vector for every corpus document.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
print(corpus)
from gensim import corpora, models, similarities

# Step 1: fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

### PHRASE COORDINATES — TF-IDF weights of the query phrase.
frase = tfidf[new_vec]
print(frase)

# Re-weight every document with the fitted model and show the result.
corpus_tfidf = tfidf[corpus]
for weighted_doc in corpus_tfidf:
    print(weighted_doc)

# Step 2: project the TF-IDF corpus into a 2-dimensional LSI space.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)
## TEXT COORDINATES
# Materialize the (lazy) LSI transformation of every document.
# Idiom fix: list(...) replaces the original manual append loop.
todas = list(corpus_lsi)
# (The duplicate `from gensim import corpora, models, similarities` was
# removed — those names are already imported above.)
# Reload the persisted dictionary and corpus (round-trip demonstration).
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# BUG FIX: tokenize the query with the same Portuguese tokenizer used to
# build the dictionary; plain .split() can yield tokens the dictionary has
# never seen (punctuation, casing), silently dropping terms.
doc = new_doc
vec_bow = dictionary.doc2bow(
    tokenize.word_tokenize(doc.lower(), language='portuguese'))
vec_lsi = lsi[vec_bow]
print(vec_lsi)
# LSI coordinates of every document, for the plot below.
# Idiom fix: comprehension over `documents` instead of an index loop, and
# the Portuguese tokenizer so the tokens match the dictionary.
p = [
    lsi[dictionary.doc2bow(
        tokenize.word_tokenize(doc1.lower(), language='portuguese'))]
    for doc1 in documents
]
# Build a cosine-similarity index over the LSI-projected corpus, persist
# it and reload it (round-trip demonstration).
index = similarities.MatrixSimilarity(lsi[corpus])
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

# Similarity of the query phrase against every document.
sims = index[vec_lsi]
print(list(enumerate(sims)))

# Rank documents most-similar first.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims)
#################
import gensim
import numpy as np
import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib as mpl

# Dense 2-D LSI coordinates: corpus2dense gives one column per document;
# transpose so each row is a document's (x, y) point.
matrix1 = gensim.matutils.corpus2dense(p, num_terms=2)
matrix3 = matrix1.T

# Every arrow starts at the origin: prepend (0, 0) to each point, giving
# rows of the form (x0, y0, dx, dy) for plt.arrow below.
# ROBUSTNESS FIX: iterate over the actual rows instead of the hard-coded
# range(0, 9), so the code survives a change in corpus size.
matrix4 = [np.insert(row, 0, [0, 0]) for row in matrix3]

# Same treatment for the query phrase.
matrix2 = gensim.matutils.corpus2dense([vec_lsi], num_terms=2)
matrix2 = np.insert(matrix2, 0, [0, 0])

# Stack query + documents into one (n + 1, 4) array of arrow specs.
# ROBUSTNESS FIX: derive the row count instead of hard-coding 10.
DATA = np.insert(matrix4, 0, matrix2)
DATA = DATA.reshape(len(matrix4) + 1, 4)

# Labels for every arrow, query phrase first.
names = np.array(documents)
names = np.insert(names, 0, new_doc)
# Color each arrow by its y-coordinate (DATA[:, 3]) using the jet colormap.
cmap = plt.cm.jet
cNorm = colors.Normalize(vmin=np.min(DATA[:, 3]) + .2, vmax=np.max(DATA[:, 3]))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

# BUG FIX: the original called plt.subplots() immediately before
# plt.figure(), leaving a stray empty figure window; one figure suffices.
plt.figure(figsize=(12, 9))

# Document points (default color) and the query phrase (red, larger marker).
plt.scatter(matrix1[0], matrix1[1], s=60)
plt.scatter(matrix2[2], matrix2[3], color='r', s=95)

# One arrow from the origin to each point, colored by its y value.
for idx in range(len(DATA[:, 1])):
    colorVal = scalarMap.to_rgba(DATA[idx, 3])
    plt.arrow(DATA[idx, 0],
              DATA[idx, 1],
              DATA[idx, 2],
              DATA[idx, 3],
              color=colorVal, head_width=0.002, head_length=0.001)

# BUG FIX: the original loop variable shadowed (and destroyed) the
# `names` array; use a distinct name for the per-item label.
for i, label in enumerate(names):
    plt.annotate(label, (DATA[i][2], DATA[i][3]), va='top')

# BUG FIX: the model used above is LSI (LsiModel), not word2vec — the old
# title was misleading.
plt.title("PHRASE SIMILARITY - LSI with GENSIM library")
plt.xlim(min(DATA[:, 2] - .2), max(DATA[:, 2] + 1))
plt.ylim(min(DATA[:, 3] - .2), max(DATA[:, 3] + .3))
plt.show()
我的问题是:Gensim有没有额外的设置来生成葡萄牙语中正确的单词嵌入,或者Gensim不支持这种语言?
一年零十个月后,我自己得到的答复是:在PyTorch中使用BERT嵌入:
短语:
我改编了 PyTorch 版 BERT 的 extract_features.py 脚本:https://github.com/ethanjperez/pytorch-pretrained-BERT/blob/master/examples/extract_features.py
然后运行:
(原命令的代码块在转载时丢失)然后解析脚本输出的 JSON 文件:
作为输出获取:
相关问题 更多 >
编程相关推荐