
word2vec practice

learning-log22 2025. 5. 24. 18:50

Data file: J. K. Rowling - Harry Potter 1 - Sorcerers Stone.txt (0.42 MB)

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

1. Load data
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
 
 %cd /content/drive/MyDrive/Colab Notebooks/TMNLP
/content/drive/MyDrive/Colab Notebooks/TMNLP
 
file = open("J. K. Rowling - Harry Potter 1 - Sorcerers Stone.txt",'r')
raw_data_1 = file.read()
file.close()
 
2. Preprocessing
words = raw_data_1.split()
words = [ word.lower() for word in words if len(word)>1 and word.isalpha()]

temp = np.unique(words, return_counts=True)
vocab = temp[0][temp[1] > 10].astype('object')

 
char_to_int = dict((c,i) for i,c in enumerate(vocab))
int_to_char = dict((i,c) for i,c in enumerate(vocab))
 
words = [x for x in words if x in char_to_int.keys()]
len(words), len(vocab)
(47930, 639)
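
A quick optional check (my addition, not in the original notebook): list the most frequent words that survived the count > 10 cut, reusing the counts already stored in temp.

# Top-10 most frequent vocabulary entries, reusing the np.unique counts in temp
counts = dict(zip(temp[0], temp[1]))
top10 = sorted(vocab, key=lambda w: -counts[w])[:10]
print([(w, int(counts[w])) for w in top10])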
 
2-1. Generate the skip-gram training data with window size
vocab_size = len(vocab)
 
# Window size
window_size = 3
 
# Number of negative samples for each positive sample
num_ns = 4
 
# Random seed
np.random.seed(1)
Q1. Build temp_dict, whose items are pairs of curr_word (the target word) and each other word within window_size (a context word).
temp_dict = []
for i in range(len(words)):
    a = max(0, i - window_size)  # clamp at 0 so the slice does not wrap around near the start
    b = i+window_size
    curr_word = words[i]
    context_words = words[a:i] + words[i+1:b+1]
    for context_word in context_words:
        temp_dict.append((curr_word, context_word))
 
train_df = pd.DataFrame(temp_dict)
train_df.columns = ['target','context']
pos_words = train_df.groupby('target')['context'].apply(list).to_dict()
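
A small sanity check (optional, my addition): the first few pairs should show each target word next to its neighbors inside the window.

# Peek at the generated skip-gram pairs and the resulting DataFrame
print(temp_dict[:5])
print(train_df.head())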

Q2. Build context_list and label_list. Each entry of context_list is [pos_sample, neg_sample_1, ..., neg_sample_num_ns], with the matching 1/0 labels in label_list.
Use the following: np.random.choice(a_set_of_neg_words, number_of_neg_samples); context_ = [context] + list(negative_samples)
 
target_list = []
context_list = []
label_list = []

for i in tqdm(range(train_df.shape[0])):
    target = train_df.iloc[i]['target']
    context = train_df.iloc[i]['context']

    neg_words = list(set(vocab).difference(pos_words[target]))

    target_list.append(char_to_int[target])

    context_ = [context] + list(np.random.choice(neg_words, num_ns))
    context_list.append([char_to_int[x] for x in context_])

    label_ = [1] + [0]*num_ns
    label_list.append(label_)
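
The loop above recomputes set(vocab).difference(pos_words[target]) for every row, which is its slowest part. A possible speed-up (a sketch, not the post's code) caches one negative-word pool per target and draws from it inside the loop.

# Sketch: precompute one negative-word pool per target word, then use
# np.random.choice(neg_pool[target], num_ns) inside the loop instead of
# rebuilding the set difference per row.
vocab_set = set(vocab)
neg_pool = {t: np.array(sorted(vocab_set - set(ctx))) for t, ctx in pos_words.items()}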

3. Training
BATCH_SIZE = 64
BUFFER_SIZE = 10000
3-1. Convert the data to a tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices(((target_list, context_list), label_list))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)
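
print(dataset) only shows the element spec; pulling one batch (an optional check, my addition) confirms the shapes the model will receive.

# One batch: targets (BATCH_SIZE,), contexts and labels (BATCH_SIZE, num_ns + 1)
for (targets, contexts), labels in dataset.take(1):
    print(targets.shape, contexts.shape, labels.shape)  # (64,) (64, 5) (64, 5)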

3-2. Build model
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = tf.keras.layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = tf.keras.layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch,) here; some pipelines yield (batch, 1), hence the squeeze below
    # context: (batch, num_ns + 1)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots
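
The einsum pattern 'be,bce->bc' computes, for every example in the batch, the dot product between the target embedding and each of the num_ns + 1 context embeddings. A tiny NumPy check of that contraction (illustrative only):

# Verify that 'be,bce->bc' is a batch of dot products:
# dots[b, c] == word_emb[b] . context_emb[b, c]
w = np.random.rand(2, 3)       # (batch, embed)
c = np.random.rand(2, 5, 3)    # (batch, context, embed)
dots = np.einsum('be,bce->bc', w, c)
assert np.allclose(dots[0, 1], w[0] @ c[0, 1])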
    
3-3. Specify the optimizer
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
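
The post compiles the model but does not show the training call; a minimal sketch (the epoch count is my assumption, not from the post):

word2vec.fit(dataset, epochs=20)  # epochs=20 is an assumed value; tune as needed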
 
4. Investigation & Visualization
# Target-word embedding matrix (the Embedding layer named "w2v_embedding")
word_embeddings = word2vec.get_layer('w2v_embedding').get_weights()[0]

Q3. Using the 128-dimensional embedding vectors, find the 10 words closest to 'voldemort'.
If V is the embedding matrix of all words and v is the embedding vector of 'voldemort',
np.argsort(-np.matmul(V, v)) sorts the word indices in descending order of their dot product with 'voldemort' (most similar first).
 
word_ids_similar_to_voldemort = np.argsort(-np.matmul(word_embeddings, word_embeddings[char_to_int['voldemort']]))[:10]
 
print([int_to_char[x] for x in word_ids_similar_to_voldemort])
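
The ranking above uses raw dot products, which favor high-norm vectors. A cosine-similarity variant (my addition, not the post's method) normalizes the embeddings first:

# Cosine similarity: normalize rows, then rank by dot product with 'voldemort'
unit = word_embeddings / np.linalg.norm(word_embeddings, axis=1, keepdims=True)
closest = np.argsort(-unit @ unit[char_to_int['voldemort']])[:10]
print([int_to_char[i] for i in closest])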

Q4. Plot the word embeddings of the following words on a 2-dimensional PCA plane:
['harry', 'voldemort', 'dumbledore', 'go', 'come', 'man', 'woman']

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
vectors_pca = pca.fit_transform(word_embeddings)
vectors_pca = pd.DataFrame(vectors_pca)
vectors_pca['word'] = list(int_to_char.values())

ids = [char_to_int[x] for x in ['harry', 'voldemort', 'dumbledore', 'go', 'come', 'man', 'woman']]

print(ids)

vectors_pca = vectors_pca.loc[ids]

plt.scatter(vectors_pca[0], vectors_pca[1])
for obs in vectors_pca.values:
    plt.annotate(obs[2], (obs[0], obs[1]))
plt.show()
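
TSNE is imported at the top but never used; as an alternative to PCA, a t-SNE projection of the full vocabulary can be plotted for the same seven words (a sketch; perplexity and random_state are assumed values).

# t-SNE on all word embeddings, then plot the same seven words
tsne = TSNE(n_components=2, perplexity=30, random_state=1)
vectors_tsne = tsne.fit_transform(word_embeddings)

plt.scatter(vectors_tsne[ids, 0], vectors_tsne[ids, 1])
for i in ids:
    plt.annotate(int_to_char[i], (vectors_tsne[i, 0], vectors_tsne[i, 1]))
plt.show()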