from __future__ import print_function
import json
import numpy as np
from sklearn.cross_validation import ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix
with open("kardashian-transcripts.json", "rb") as f:
transcripts = json.load(f)
kris_msgs = [m['text'] for transcript in transcripts for m in transcript
if m['speaker'] == 'KRIS']
bruce_msgs = [m['text'] for transcript in transcripts for m in transcript
if m['speaker'] == 'BRUCE']
kris_classes = ['Kris' for _ in kris_msgs]
bruce_classes = ['Bruce' for _ in bruce_msgs]
msgs = kris_msgs + bruce_msgs
msgs = np.array(msgs)
classes = kris_classes + bruce_classes
classes = np.array(classes)
nr_mesages=len(msgs)
shuffle_split = ShuffleSplit(nr_mesages, test_size=0.5, random_state=0)
train_idx, test_idx = next(iter(shuffle_split)) #iterator
train_idx[:10]
msgs_train = msgs[train_idx]
msgs_test = msgs[test_idx]
classes_train = classes[train_idx]
classes_test = classes[test_idx]
# First, let's see how well we can do without looking at the features at all:
# learn the majority class from the train set and always predict that.
# (The original file had this sentence as bare prose, which is a SyntaxError.)
n_kris = int(np.sum(classes_train == "Kris"))
n_bruce = int(np.sum(classes_train == "Bruce"))
print("number of messages in Kris class:", n_kris)
print("number of messages in Bruce class:", n_bruce)

# Ties go to Bruce, matching the original if/else ordering.
if n_kris > n_bruce:
    majority_class = "Kris"
else:
    majority_class = "Bruce"
print("Majority class is", majority_class)

n_train = len(classes_train)

# Always predict the majority class for every test message.
majority_classes_test = [majority_class for _ in msgs_test]
print("Majority baseline accuracy: {:.2f}%".format(
    np.mean(majority_classes_test == classes_test) * 100))
## getting term-doc matrix
# Unigram + bigram counts; for unigrams only use ngram_range=(1, 1).
vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer.fit(msgs_train)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); branch on availability so this runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

term_document_matrix_train = vectorizer.transform(msgs_train)

# Confusion matrix of the majority baseline (rows = true, cols = predicted).
print(confusion_matrix(classes_test, majority_classes_test))
# Bernoulli Naive Bayes over binary term presence; alpha=1 is Laplace
# smoothing. MultinomialNB() is the count-based alternative tried earlier.
classifier = BernoulliNB(alpha=1)
classifier.fit(term_document_matrix_train, classes_train)

term_document_matrix_test = vectorizer.transform(msgs_test)
predicted_classes_test = classifier.predict(term_document_matrix_test)
predicted_classes_train = classifier.predict(term_document_matrix_train)

# Train vs. test accuracy (a large gap would indicate overfitting).
print("Accuracy: {:.2f}%".format(np.mean(predicted_classes_train == classes_train) * 100))
print("Accuracy: {:.2f}%".format(np.mean(predicted_classes_test == classes_test) * 100))

# classifier.classes_ holds the label order used for the confusion-matrix
# rows/columns (sorted alphabetically: 'Bruce', 'Kris').
print(confusion_matrix(classes_test, predicted_classes_test))
print(classification_report(classes_test, predicted_classes_test))
print(terms[-5:])
# Rows of feature_log_prob_ follow classifier.classes_; look the indices up
# instead of hard-coding 0/1 so this stays correct if the labels ever change.
label_order = list(classifier.classes_)
# log P(feature | Bruce)
bruce_probs = classifier.feature_log_prob_[label_order.index('Bruce'), :]
# log P(feature | Kris)
kris_probs = classifier.feature_log_prob_[label_order.index('Kris'), :]

# Positive log-odds -> more indicative of Bruce; negative -> of Kris.
logodds = bruce_probs - kris_probs

print("\nFeatures that are most indicative of Kris:\n")
for i in np.argsort(logodds)[:10]:
    print(terms[i])

print("\n\nFeatures that are most indicative of Bruce\n")
for i in np.argsort(-logodds)[:10]:
    print(terms[i])