from __future__ import print_function
import json
import numpy as np
from sklearn.cross_validation import ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from scipy import sparse as sp
EXTRA = False  # set to True to activate the extra feature (length in our case, but could be easily changed)

## Load the raw data: a list of transcripts, each transcript a list of
## {'speaker': ..., 'text': ...} message dicts.
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
# transcripts[1][1]  # notebook residue: inspected one message; a no-op in a script
def _speaker_data(transcripts, speaker):
    """Return (messages, prev_tokens) for every line spoken by *speaker*.

    messages[i] is the text of the i-th line spoken by *speaker*;
    prev_tokens[i] is "to_<PREVIOUS SPEAKER>" for the line that precedes it
    in its transcript, or "_T" when the line opens a transcript.
    """
    messages = [m['text']
                for transcript in transcripts for m in transcript
                if m['speaker'] == speaker]
    ## who is the previous speaker
    prev_tokens = [("to_" + transcript[k - 1]['speaker'] if k > 0 else "_T")
                   for transcript in transcripts for k, m in enumerate(transcript)
                   if m['speaker'] == speaker]
    return messages, prev_tokens

## Same extraction for both speakers (was duplicated inline for each).
kris_msgs, kris_prev = _speaker_data(transcripts, 'KRIS')
bruce_msgs, bruce_prev = _speaker_data(transcripts, 'BRUCE')
# bruce_prev[:10]  # notebook residue: a no-op in a script
## we need to (a) merge multiple lines from the same person (data representation issue)
## and (b) treat both KRIS and BRUCE as unknown labels when they appear as previous
## speakers, so the classifier cannot read the target class straight off this feature.
_MASKED = {"to_KRIS", "to_BRUCE"}
kris_prev = ["_M" if token in _MASKED else token for token in kris_prev]
bruce_prev = ["_M" if token in _MASKED else token for token in bruce_prev]
## Prepend the previous-speaker token to each message so CountVectorizer
## treats it as an ordinary term.
kris_msgs = [target + " " + m for target, m in zip(kris_prev, kris_msgs)]
bruce_msgs = [target + " " + m for target, m in zip(bruce_prev, bruce_msgs)]
## Build the full dataset: one label per message.
kris_classes = ['Kris' for _ in kris_msgs]
bruce_classes = ['Bruce' for _ in bruce_msgs]
msgs = np.array(kris_msgs + bruce_msgs)
classes = np.array(kris_classes + bruce_classes)
# kris_msgs[:10]  # notebook residue: a no-op in a script
## zip pairs elements position-wise, e.g. list(zip([1, 2, 3], ["a", "b", "c"]))
## == [(1, 'a'), (2, 'b'), (3, 'c')]; note zip is a lazy iterator in Python 3.
## Reproducible 50/50 shuffle split. (The original used
## sklearn.cross_validation.ShuffleSplit, a module removed in scikit-learn
## 0.20; a seeded NumPy permutation gives the same kind of deterministic
## random split without the deprecated API.)
nr_messages = len(msgs)
rng = np.random.RandomState(0)
permutation = rng.permutation(nr_messages)
n_test = int(np.ceil(nr_messages * 0.5))  # test_size=0.5
test_idx = permutation[:n_test]
train_idx = permutation[n_test:]
# train_idx[:10]  # notebook residue: a no-op in a script
msgs_train = msgs[train_idx]
msgs_test = msgs[test_idx]
classes_train = classes[train_idx]
classes_test = classes[test_idx]
## First, let's see how well we can do without looking at the features at all.
## We will learn the majority class from the train set and always predict that.
n_kris = sum(1 for c in classes_train if c == "Kris")
n_bruce = sum(1 for c in classes_train if c == "Bruce")
print("number of messages in Kris class:", n_kris)
print("number of messages in Bruce class:", n_bruce)
majority_class = "Kris" if n_kris > n_bruce else "Bruce"
print("Majority class is " + majority_class)
n_train = len(classes_train)  # kept for interactive inspection; unused below
## always predict majority class
majority_classes_test = [majority_class for _ in msgs_test]
## list == ndarray compares element-wise, so the mean is the accuracy.
print("Majority baseline accuracy: {:.2f}%".format(
    np.mean(majority_classes_test == classes_test) * 100))
## getting term-doc matrix of unigram + bigram counts
vectorizer = CountVectorizer(ngram_range=(1, 2))  # for unigrams only use ngram_range=(1, 1)
vectorizer.fit(msgs_train)
## get_feature_names() was removed in scikit-learn 1.2 in favour of
## get_feature_names_out(); use whichever this sklearn version provides.
_get_names = getattr(vectorizer, "get_feature_names_out", None)
terms = list(_get_names()) if _get_names is not None else vectorizer.get_feature_names()
# terms[-10:]  # notebook residue: last few vocabulary entries
term_document_matrix_train = vectorizer.transform(msgs_train)
## Confusion matrix of the majority baseline on the test set.
print(confusion_matrix(classes_test, majority_classes_test))
## Add extra features, such as length in characters.
def extra_feature(d):
    """Return the extra feature for document *d*: its length in characters."""
    return len(d)
if EXTRA:
    ## Column vector of per-document extra features, merged with the old
    ## feature matrix. (Was split across two notebook cells with a bare
    ## `extra_train` display expression in between.)
    extra_train = np.atleast_2d([extra_feature(d) for d in msgs_train]).T
    ## hstack returns COO; convert once to CSR for efficient fitting.
    term_document_matrix_train = sp.hstack(
        [extra_train, term_document_matrix_train]).tocsr()

# classifier = MultinomialNB()  # alternative model; swap in to compare
classifier = LinearSVC()
classifier.fit(term_document_matrix_train, classes_train)
term_document_matrix_test = vectorizer.transform(msgs_test)
if EXTRA:
    ## Mirror the train-side feature merge on the test set.
    extra_test = np.atleast_2d([extra_feature(d) for d in msgs_test]).T
    term_document_matrix_test = sp.hstack(
        [extra_test, term_document_matrix_test]).tocsr()

predicted_classes_test = classifier.predict(term_document_matrix_test)
predicted_classes_train = classifier.predict(term_document_matrix_train)
print("Accuracy on train: {:.2f}%".format(np.mean(predicted_classes_train == classes_train) * 100))
print("Accuracy on test: {:.2f}%".format(np.mean(predicted_classes_test == classes_test) * 100))
## Row/column order of the confusion matrix follows classifier.classes_
## (was a bare display expression in the notebook).
print("class order:", classifier.classes_)
print(confusion_matrix(classes_test, predicted_classes_test))
print(classification_report(classes_test, predicted_classes_test))