from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import json
%matplotlib inline
import matplotlib.pyplot as plt
# Load the movie transcripts. Only the first line of the file is parsed, so
# the whole dataset is expected to be one JSON document on that line.
with open("movie_scripts_data.json", encoding="utf-8") as f:
    data = json.loads(f.readline())

print("Loaded {} movie transcripts".format(len(data)))
print("Each movie transcript is a dictionary with the following keys...")
print(data[0].keys())

# Map every movie_id to its position in `data`. The same ordering is used
# when the scripts are turned into matrix rows, so this is also the row index.
movie_id_to_index = {
    movie['movie_id']: index for index, movie in enumerate(data)
}
print("The index of \"{}\" is {}".format(
    data[7]['movie_name'], movie_id_to_index[data[7]['movie_id']]))
We can see that each movie is assigned an "index" (from 0 to 616). These indices correspond to the rows of a document-by-term count matrix.
# Build a binary document-by-term matrix: one row per script, one column per
# vocabulary word, with a 1 wherever the word occurs in the script at all
# (binary=True). English stop words are removed, words appearing in more than
# 80% of the scripts or in fewer than 10 scripts are dropped, and the
# vocabulary is capped at 1000 terms.
count_vec = CountVectorizer(
    stop_words='english',
    max_df=0.8,
    min_df=10,
    max_features=1000,
    binary=True,
)
term_doc_matrix = count_vec.fit_transform([x['script'] for x in data])
print(term_doc_matrix.shape)

# word index
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and keep `features` a plain list so `.index()` keeps working.
try:
    features = count_vec.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0 only provides the old accessor
    features = count_vec.get_feature_names()
print(features[:100])

# Densify the sparse result so the plain NumPy operations below apply.
term_doc_matrix = term_doc_matrix.toarray()
# Bare expression: presumably kept to display the first row in a notebook cell.
term_doc_matrix[:1].tolist()

# from sklearn.preprocessing import normalize
# doc_by_vocab2 = normalize(doc_by_vocab, axis=0)
# term_doc_matrix = doc_by_vocab.T

# Word-by-word co-occurrence: entry (i, j) counts the scripts that contain
# both word i and word j; the diagonal holds each word's document frequency.
cooccurence_matrix = np.dot(term_doc_matrix.T, term_doc_matrix)
def find_most_similar_words(word, similarity_matrix, topk=10, vocab=None):
    """Print the words most similar to ``word`` according to a matrix.

    Args:
        word: Query word to look up in the vocabulary.
        similarity_matrix: 2-D array indexed as ``[i, j]`` whose row and
            column order matches the vocabulary order.
        topk: Maximum number of words to print. Values larger than the
            number of available columns are truncated; negative values
            print nothing.
        vocab: Optional sequence of vocabulary words. Defaults to the
            module-level ``features`` list.

    Returns:
        None. The ranking is printed, one ``word score`` pair per line, in
        descending score order. The query word itself is not excluded, so
        it can appear in its own ranking.
    """
    # Copy into a list so that array-like vocabularies also support .index().
    vocab = list(features if vocab is None else vocab)
    try:
        idx = vocab.index(word)
    except ValueError:
        print(word, 'is OOV.')
        return None

    sorted_words = np.argsort(similarity_matrix[idx])[::-1]
    # Slicing (rather than indexing range(topk)) keeps an oversized topk from
    # running past the end of the vocabulary.
    shown = sorted_words[:max(topk, 0)]
    print('Most similar {} words to "{}" are:'.format(len(shown), word))
    for j in shown:
        print(vocab[j], similarity_matrix[idx, j])
    return None
# Rank the vocabulary by raw co-occurrence count with "computer".
find_most_similar_words('computer', cooccurence_matrix)
Hmm, not great. The issue is that this does not account for words occurring together by chance alone. E.g., "hell" is quite a popular word in movies. We can account for the probability of the two words co-occurring by chance using (a version of) PMI.
# Per-word totals: column sums of the document-by-term matrix.
pa = term_doc_matrix.sum(axis=0)
pa.shape

# Divide the co-occurrence counts by both words' totals in two passes: each
# division scales the columns, and the transpose in between lets the second
# pass reach the other axis. The result is
# PMI[i, j] = cooccurence_matrix[j, i] / (pa[i] * pa[j]); no logarithm is taken.
PMI_part = np.divide(cooccurence_matrix, pa)
PMI = np.divide(PMI_part.T, pa)
find_most_similar_words('computer', PMI, topk=10)
# cooccurence_matrix / np.reshape(np.sum(term_doc_matrix, axis=0), (1000, 1))