import os
import gensim.models as g
import logging
import gensim
# Switch to the project directory so the relative file names below resolve.
os.chdir("/home/ai/path")

# --- doc2vec hyper-parameters (forwarded to g.Doc2Vec in the training step) ---
vector_size = 300            # forwarded as size=
window_size = 5              # forwarded as window=
min_count = 1                # forwarded as min_count=
sampling_threshold = 1e-5    # forwarded as sample=
negative_size = 5            # forwarded as negative=
train_epoch = 100            # forwarded as iter=
dm = 0                       # forwarded as dm=
worker_count = 2             # number of parallel processes (workers=)

# --- file names, relative to the working directory set above ---
pretrained_emb = "GoogleNews-vectors-negative300.bin"  # pretrained word embeddings
train_corpus = "mydata.txt"                            # input corpus
saved_path = "Googlemodel.bin"                         # output model
# Enable INFO-level logging with a timestamped "<time> : <level> : <message>" layout.
# The format string is kept on one physical line: a quoted literal that is
# wrapped across lines is an unterminated string and fails to parse.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# Train the doc2vec model on the corpus file, then persist it to disk.
docs = g.doc2vec.TaggedLineDocument(train_corpus)

# NOTE(review): `pretrained_emb` is not a keyword of stock gensim's Doc2Vec, and
# `size` / `iter` are the pre-4.0 spellings of `vector_size` / `epochs`.
# Presumably this script targets an older, patched gensim build -- confirm the
# installed version before renaming or removing any of these arguments.
model = g.Doc2Vec(
    docs,
    size=vector_size,
    window=window_size,
    min_count=min_count,
    sample=sampling_threshold,
    workers=worker_count,
    hs=0,
    dm=dm,
    negative=negative_size,
    dbow_words=1,
    dm_concat=1,
    pretrained_emb=pretrained_emb,
    iter=train_epoch,
)

# Write the trained model to the configured output path; without this the
# `saved_path` setting is never used and the model is lost when the script exits.
model.save(saved_path)
GoogleNews-vectors-negative300.bin 的大小是 3.6 GB,我的数据大小是 455 MB。
运行此代码或完成训练过程后,我的输出模型只有 850 MB。
目前没有回答
相关问题 更多 >
编程相关推荐