Info/CS 4300: Language and Information - in-class demo

Text Classification

In [1]:
from __future__ import print_function
import json
import numpy as np

from sklearn.cross_validation import ShuffleSplit  # deprecated since sklearn 0.18; see sklearn.model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


from sklearn.metrics import classification_report, confusion_matrix
/Users/cristian/anaconda2/envs/cs4300/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Load the data

In [2]:
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
In [3]:
kris_msgs = [m['text'] for transcript in transcripts for m in transcript
             if m['speaker'] == 'KRIS']


bruce_msgs = [m['text'] for transcript in transcripts for m in transcript
              if m['speaker'] == 'BRUCE']
In [4]:
kris_classes = ['Kris' for _ in kris_msgs]


bruce_classes = ['Bruce' for _ in bruce_msgs]


msgs = kris_msgs + bruce_msgs
msgs = np.array(msgs)

classes = kris_classes + bruce_classes
classes = np.array(classes)

Leave out a test set

In [5]:
nr_messages = len(msgs)

shuffle_split = ShuffleSplit(nr_messages, test_size=0.5, random_state=0)
In [6]:
train_idx, test_idx = next(iter(shuffle_split))  # take the first (train, test) index split

train_idx[:10]
Out[6]:
array([5815, 8522, 8344, 4095, 5408, 5757, 3221, 4114, 8621, 8788])
In [7]:
msgs_train = msgs[train_idx]
msgs_test =  msgs[test_idx]

classes_train = classes[train_idx]
classes_test = classes[test_idx]
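The deprecation warning above notes that sklearn.cross_validation was replaced by sklearn.model_selection in 0.18. A minimal sketch of the same split under the newer API (an illustration, not what was run in class):

from sklearn.model_selection import ShuffleSplit as ShuffleSplitNew

# one random 50/50 split; the data is passed to .split() rather than the constructor
splitter = ShuffleSplitNew(n_splits=1, test_size=0.5, random_state=0)
train_idx_new, test_idx_new = next(splitter.split(msgs))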

Majority baseline

First, let's see how well we can do without looking at the features at all. We will learn the majority class from the training set and always predict that class.

In [8]:
n_kris=len([1 for c in classes_train if c == "Kris"])
n_bruce=len([1 for c in classes_train if c == "Bruce"])


print("number of messages in Kris class:",n_kris)
print("number of messages in Bruce class:",n_bruce)

if n_kris>n_bruce:
    print("Majority class is Kris")
    majority_class="Kris"
else:
    print("Majority class is Bruce")
    majority_class="Bruce"


n_train = len(classes_train)

# always predict the majority class
majority_classes_test = [majority_class for _ in msgs_test]

print("Majority baseline accuracy: {:.2f}%".format(np.mean(majority_classes_test == classes_test) * 100))
number of messages in Kris class: 2847
number of messages in Bruce class: 2078
Majority class is Kris
Majority baseline accuracy: 56.83%
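As an aside (not part of the original demo), sklearn ships this same majority-class baseline as DummyClassifier; a short sketch, assuming the variables defined above:

from sklearn.dummy import DummyClassifier

# strategy='most_frequent' learns the majority class from y and ignores the features
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(np.zeros((len(classes_train), 1)), classes_train)   # placeholder features
dummy_pred = dummy.predict(np.zeros((len(classes_test), 1)))
print("DummyClassifier baseline accuracy: {:.2f}%".format(np.mean(dummy_pred == classes_test) * 100))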

Getting features (term-doc matrix)

In [9]:
## getting term-doc matrix
vectorizer = CountVectorizer(ngram_range=(1, 2))  # for unigrams only, use ngram_range=(1, 1)
vectorizer.fit(msgs_train)

terms = vectorizer.get_feature_names()
terms[-10:]
Out[9]:
[u'yourself out',
 u'yourselves',
 u'yourselves single',
 u'youto',
 u'youto make',
 u'yum',
 u'zack',
 u'zack you',
 u'zito',
 u'zito rico']
In [10]:
term_document_matrix_train = vectorizer.transform(msgs_train)
term_document_matrix_train
Out[10]:
<4925x15630 sparse matrix of type '<type 'numpy.int64'>'
	with 57528 stored elements in Compressed Sparse Row format>
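To make the feature space concrete, here is a tiny illustrative example (toy sentences, not the demo data) of what ngram_range=(1, 2) produces: one column per unigram and per adjacent-word bigram, with per-document counts.

# toy example: columns are unigrams and bigrams, values are counts per document
toy_vectorizer = CountVectorizer(ngram_range=(1, 2))
toy_matrix = toy_vectorizer.fit_transform(["bruce lost few pounds", "kris okay well"])
print(toy_vectorizer.get_feature_names())
print(toy_matrix.toarray())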
In [11]:
print(confusion_matrix(classes_test, majority_classes_test))  # majority baseline (rows: true class, columns: predicted class)
[[   0 2126]
 [   0 2799]]

Building a classifier

In [26]:
# classifier = MultinomialNB()   # alternative: uses raw term counts
classifier = BernoulliNB(alpha=1)   # binary presence/absence features, Laplace smoothing

classifier.fit(term_document_matrix_train,classes_train)
Out[26]:
BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
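BernoulliNB models whether a term occurs in a message at all (binarize=0.0 thresholds the counts), while MultinomialNB models how often it occurs. A sketch for comparing the two variants on this split, reusing the variables above (not run in class):

# fit both NB variants on the same features and compare held-out accuracy
for clf in [MultinomialNB(alpha=1), BernoulliNB(alpha=1)]:
    clf.fit(term_document_matrix_train, classes_train)
    acc = np.mean(clf.predict(vectorizer.transform(msgs_test)) == classes_test)
    print(clf.__class__.__name__, "test accuracy: {:.2f}%".format(acc * 100))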

Evaluate

In [27]:
term_document_matrix_test = vectorizer.transform(msgs_test)
len_test = [len(d) for d in msgs_test]  # message lengths (not used below)

predicted_classes_test = classifier.predict(term_document_matrix_test)
In [24]:
predicted_classes_train = classifier.predict(term_document_matrix_train)
print("Accuracy: {:.2f}%".format(np.mean(predicted_classes_train == classes_train) * 100))
Accuracy: 89.58%
In [25]:
print("Accuracy: {:.2f}%".format(np.mean(predicted_classes_test == classes_test) * 100))
Accuracy: 71.43%
In [16]:
classifier.classes_  #checking the order of the classes (for the confusion matrix)
Out[16]:
array(['Bruce', 'Kris'], 
      dtype='|S5')
In [17]:
print(confusion_matrix(classes_test, predicted_classes_test))  
[[ 987 1139]
 [ 311 2488]]
In [18]:
print(classification_report(classes_test, predicted_classes_test))
             precision    recall  f1-score   support

      Bruce       0.76      0.46      0.58      2126
       Kris       0.69      0.89      0.77      2799

avg / total       0.72      0.71      0.69      4925
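To connect the report to the confusion matrix above (rows are true classes, columns are predicted classes, both ordered Bruce, Kris), the Bruce row of the report can be recomputed directly from the counts:

# Bruce precision = correct Bruce predictions / all Bruce predictions
# Bruce recall    = correct Bruce predictions / all true Bruce messages
cm = confusion_matrix(classes_test, predicted_classes_test)
bruce_precision = cm[0, 0] / float(cm[:, 0].sum())   # 987 / (987 + 311)  ~ 0.76
bruce_recall = cm[0, 0] / float(cm[0, :].sum())      # 987 / (987 + 1139) ~ 0.46
print("Bruce precision: {:.2f}, recall: {:.2f}".format(bruce_precision, bruce_recall))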

Features

In [19]:
print(terms[-5:])
classifier.feature_count_[:, -5:]   #classes by words
[u'yum', u'zack', u'zack you', u'zito', u'zito rico']
Out[19]:
array([[ 0.,  1.,  1.,  1.,  1.],
       [ 1.,  0.,  0.,  0.,  0.]])
In [116]:
# log P(feature | Bruce)
bruce_probs = classifier.feature_log_prob_[0, :]

# log P(feature | Kris)
kris_probs = classifier.feature_log_prob_[1, :]


logodds = bruce_probs - kris_probs  # positive: more likely under Bruce; negative: more likely under Kris

# Kris: smallest (most negative) log-odds
print("\nFeatures that are most indicative of Kris:\n")
for i in np.argsort(logodds)[:10]:
    print(terms[i])
    
print("\n\nFeatures that are most indicative of Bruce\n")
# Bruce: largest (most positive) log-odds
for i in np.argsort(-logodds)[:10]:
    print(terms[i])
Features that are most indicative of Kris:

stop it
feels
okay well
birthday
that enough
every time
feelings
whoo
for her
her to


Features that are most indicative of Bruce

few pounds
pounds
lose few
could lose
kimberly
he been
difference
like what
hand hug
mean this
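As a sanity check on where feature_log_prob_ comes from: for BernoulliNB with alpha=1, log P(feature | class) is the Laplace-smoothed fraction of the class's messages that contain the feature, i.e. log((count + alpha) / (N_class + 2*alpha)). A small sketch reconstructing it from the raw counts (an illustration, not part of the original demo):

# rebuild feature_log_prob_ from feature_count_ (documents per class containing the term)
# and class_count_ (documents per class), with add-one smoothing
alpha = 1.0
smoothed = (classifier.feature_count_ + alpha) / (classifier.class_count_[:, None] + 2 * alpha)
print(np.allclose(np.log(smoothed), classifier.feature_log_prob_))  # expected: True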