2
我正在使用 gensim 的 Doc2Vec(from gensim.models.doc2vec import Doc2Vec),
用兩個完全相同的句子(文檔)進行訓練,但在檢查每個句子的向量時,發現它們完全不同。是因爲神經網絡的隨機初始化不同嗎?爲什麼 gensim Doc2Vec 會爲相同的句子給出不同的向量?
# imports
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from gensim import utils
# Document iteration class (turns many documents into sentences,
# each document being one sentence)
class LabeledDocs(object):
    """Iterable corpus that turns each source file into one labeled document.

    ``sources`` maps a file path to the prefix (label) attached to the
    document read from that file. Every iteration (and every call to
    ``to_array``) re-opens and re-reads the files, so the object can be
    iterated more than once.
    """

    def __init__(self, sources):
        """Store ``sources`` and verify that no prefix is used twice.

        Args:
            sources: dict mapping source file path -> prefix (label).

        Raises:
            ValueError: if two sources share the same prefix.
        """
        self.sources = sources
        # The dict keys (paths) are unique by construction; it is the
        # prefixes (the values) that must not repeat.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    @staticmethod
    def _read_document(source, prefix):
        """Read ``source`` and return its whitespace-split words labeled with ``prefix``."""
        # Read everything and leave the ``with`` block before returning, so
        # the file is closed before the caller gets the document.
        with utils.smart_open(source) as fin:
            words = utils.to_unicode(fin.read()).split()
        return LabeledSentence(words, [prefix])

    def __iter__(self):
        """Yield one ``LabeledSentence`` per source file."""
        for source, prefix in self.sources.items():
            yield self._read_document(source, prefix)

    def to_array(self):
        """Read every source into a list, keep it on ``self.sentences`` and return it."""
        self.sentences = [
            self._read_document(source, prefix)
            for source, prefix in self.sources.items()
        ]
        return self.sentences
# "play" and "play3" are the names of two identical documents
# (running diff on them prints nothing).
inp = LabeledDocs({"play": "play", "play3": "play3"})

# NOTE(review): alpha and min_alpha are both 0.025, so the learning rate
# presumably never decays during training -- confirm this is intended.
model = Doc2Vec(
    size=20,
    window=8,
    min_count=2,
    workers=1,
    alpha=0.025,
    min_alpha=0.025,
    batch_words=1,
)

# The vocabulary is built from the materialised list of documents.
vocab_docs = inp.to_array()
model.build_vocab(vocab_docs)

# train() is called ten times on the re-iterable corpus object.
for epoch in range(10):
    model.train(inp)

# After this point, model.docvecs["play"] is very different from
# model.docvecs["play3"].
這是爲什麼?play 和 play3 兩個文件都包含:
foot ball is a sport
played with a ball where
teams of 11 each try to
score on different goals
and play with the ball