NLP Notes 2: A Python Implementation of the Word2Vec Skip-Gram Model

This post is based on that tutorial.
I ran the program on a cloud GPU in Google Colaboratory.

```python
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

flatten = lambda l: [item for sublist in l for item in sublist]  # flatten a nested list into a single flat list
# other ways to flatten a list: https://blog.csdn.net/weixin_40539892/article/details/79103290
random.seed(1024)  # fix the random seed
torch.cuda.get_device_name(0)  # check the GPU model; on Colab this returned 'Tesla P100-PCIE-16GB'

# CUDA setup; USE_CUDA and LongTensor are used by prepare_word / prepare_sequence below
USE_CUDA = torch.cuda.is_available()
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor

# Load the dataset
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()
nltk.download('punkt')
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
corpus = [[word.lower() for word in sent] for sent in corpus]  # lower-case every token
corpus[0:2]  # inspect the data
# [['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
#  ['etymology', '.']]

# The original program builds its own stopword list. I did not keep that approach,
# but the idea and the methods it uses are worth noting:
# word_count = Counter(flatten(corpus))  # count every word and punctuation token, e.g. 'a': 21
# border = int(len(word_count) * 0.01)
# stopwords = word_count.most_common()[:border] + list(reversed(word_count.most_common()))[:border]
#     # most_common() sorts tokens by frequency, from most to least frequent
# stopwords = [s[0] for s in stopwords]
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
vocab = list(set(flatten(corpus)) - set(stop_words))  # remove stopwords and build the vocabulary
vocab.append('<UNK>')
print(len(set(flatten(corpus))), len(vocab))  # 592 514

word2index = {'<UNK>': 0}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)  # index the vocabulary, e.g. {'<UNK>': 0, 'history': 1, 'patient': 2, 'side': 3, ...}

index2word = {v: k for k, v in word2index.items()}  # e.g. {0: '<UNK>', 1: 'history', 2: 'patient', 3: 'side', ...}

# Set the window size
WINDOW_SIZE = 3
# Each window has length 2 * WINDOW_SIZE + 1 = 7; every sentence is padded with '<DUMMY>'
# tokens so that every word of the corpus can serve as the center word of a window
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

# Build the training set
train_data = []
for window in windows:  # for each window, pair the center word with each of its neighbours
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))
print(train_data[:WINDOW_SIZE * 2])  # [('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]
# The training set is a list of (center word, context word) tuples

def prepare_word(word, word2index):  # wrap a word's vocabulary index in a LongTensor Variable so it can be fed to the model
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

X_p = []
y_p = []
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))  # center words (inputs), lifted to 2-D
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))  # context words (targets)
train_data = list(zip(X_p, y_p))  # repack the training set; it now holds index tensors instead of strings
len(train_data)  # 7606

# Define the model
class Skipgram(nn.Module):

    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)  # input (center-word) embeddings
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)  # output (context-word) embeddings

        self.embedding_v.weight.data.uniform_(-1, 1)  # initialize the input-vector weights
        self.embedding_u.weight.data.uniform_(0, 0)   # initialize the output-vector weights; I do not understand why these are set to zero

    def forward(self, center_words, target_words, outer_words):
        center_embeds = self.embedding_v(center_words)  # batch_size x 1 x n, where n is the embedding dimension
        target_embeds = self.embedding_u(target_words)  # batch_size x 1 x n
        outer_embeds = self.embedding_u(outer_words)    # batch_size x V x n, where V is the vocabulary size

        # bmm() is batched matrix multiplication
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)      # (batch_size x 1 x n) @ (batch_size x n x 1) => batch_size x 1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # batch_size x V

        # Loss: softmax over the whole vocabulary, then negative log likelihood
        nll = -torch.mean(torch.log(torch.exp(scores) / torch.sum(torch.exp(norm_scores), 1).unsqueeze(1)))

        return nll  # negative log likelihood

    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)

        return embeds

# Training
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

model = Skipgram(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Define the batch-processing function
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
# I wanted to use PyTorch's built-in DataLoader for batching, but after a series of dtype
# conversions, reshaping and so on it still did not work, so I kept the batch generator
# written by the original author.

def prepare_sequence(seq, word2index):  # map a sequence of words to their indices as one LongTensor Variable (used here for the whole vocabulary)
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):

        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs)    # batch_size x 1
        targets = torch.cat(targets)  # batch_size x 1
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))  # batch_size x V
        model.zero_grad()

        loss = model(inputs, targets, vocabs)

        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, loss))

# After training, test the model by measuring the cosine similarity between word vectors
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target:
            continue
        vector = model.prediction(prepare_word(vocab[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]  # sort by similarity, descending, and keep the top 10

test = random.choice(list(vocab))
word_similarity(test, vocab)
```
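A few notes on the code above. First, to make the window construction concrete, this is what the padded 7-grams look like for the second sentence of the corpus (a small check of my own, not part of the original tutorial):

```python
sent = ['etymology', '.']  # this is corpus[1] from above
list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + sent + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1))
# [('<DUMMY>', '<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>'),
#  ('<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>', '<DUMMY>')]
```

After the '<DUMMY>' positions are skipped, these two windows contribute exactly the pairs ('etymology', '.') and ('.', 'etymology') to train_data.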
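Second, the value returned by `forward` is the usual skip-gram objective with a full softmax over the vocabulary; in my own notation (not from the original post),

$$
J = -\frac{1}{B}\sum_{(c,\,o)} \log \frac{\exp\left(u_o^{\top} v_c\right)}{\sum_{w \in V} \exp\left(u_w^{\top} v_c\right)}
$$

where $B$ is the number of (center, context) pairs in the batch, $v_c$ is the input embedding (`embedding_v`) of the center word, and $u_o$, $u_w$ are output embeddings (`embedding_u`). `scores` holds the numerator terms $u_o^{\top} v_c$ and `norm_scores` the denominator terms $u_w^{\top} v_c$, which is why every batch needs the full `vocabs` tensor and why the plain-softmax version becomes expensive for large vocabularies.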
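Finally, on the DataLoader comment in the code: below is a minimal sketch of how the same batching could be done with `torch.utils.data`, reusing the `X_p` / `y_p` tensors built above. I have not verified it end to end in this notebook, so treat it as a starting point rather than a drop-in replacement:

```python
from torch.utils.data import TensorDataset, DataLoader

X = torch.cat(X_p)  # N x 1 center-word indices
y = torch.cat(y_p)  # N x 1 context-word indices

dataset = TensorDataset(X, y)  # yields (center, context) index pairs
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

for inputs, targets in loader:  # inputs, targets: batch_size x 1
    vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))
    model.zero_grad()
    loss = model(inputs, targets, vocabs)
    loss.backward()
    optimizer.step()
```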

