# Info/CS 4300: Language and Information - in-class demo

#Lecture 23

## Sentiment analysis 
### Building lexicons tailored to a domain for which we don't have sentiment labels

In [3]:
%matplotlib inline

from __future__ import print_function
import json
from operator import itemgetter
from collections import defaultdict

from matplotlib import pyplot as plt
import numpy as np

from nltk.tokenize import TreebankWordTokenizer
from nltk import FreqDist,pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB

tokenizer = TreebankWordTokenizer()


Using the movie review data, but this time we will not use the sentiment labels (we will pretend we don't have labels).

In [6]:
## loading movie review data: 
## http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
data = load_files('txt_sentoken')
print(data.data[0])

arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . 
it's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? 
once again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . 
in this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . 
with the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! 
parts of this are actually so absurd , that they would fit right in with dogma . 
yes , the film is that wea

In [7]:
## building the term documnet matrix
vec = CountVectorizer(min_df = 50)
X = vec.fit_transform(data.data)
terms = vec.get_feature_names()
len(terms)

2153

We want to only look at adjectives and adverbs. 

We will use the NLTK part of speech tokenizer.

We want to only keep words that are taged as "JJ" (adjectives) or "RB" (adverbs).

In [61]:
##example part of speech (POS) tagging (note that you need to tokenize the sentence first)
pos_tag(tokenizer.tokenize("This was a great day but the time is running out fast"))

[('This', 'DT'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('great', 'JJ'),
 ('day', 'NN'),
 ('but', 'CC'),
 ('the', 'DT'),
 ('time', 'NN'),
 ('is', 'VBZ'),
 ('running', 'VBG'),
 ('out', 'RP'),
 ('fast', 'JJ')]

In [9]:
## POS tagging  all reviews
## POS tagging is relatively slow, so this will take a while
#reviews_pos_tagged=[pos_tag(tokenizer.tokenize(m)) for m in data.data]

## Reconstructing adjective-and-adverb-only reviews
reviews_adj_adv_only=[" ".join([w for w,tag in m if tag in ["RB","JJ"]])
                      for m in reviews_pos_tagged]

In [10]:
## It kind of works:
reviews_adj_adv_only[1]

"good hard great rare rare strong masterful together virtually unheard true real married n't much enough david american anti-government only forward available highly operative wrong always terry surprising david own notable very simple complex character-driven well-written long sharply not caruso b-movie caruso too many memorable stoic memorable extremely well skillfully old-school the"

In [84]:
## term doc matrix only for adj/adv
X = vec.fit_transform(reviews_adj_adv_only)
X = X > 0  # we only keep binary values (is the word in the document)
terms = vec.get_feature_names()

In [85]:
len(terms)

483

In [86]:
# PMI type measure via matrix multiplication
def getcollocations_matrix(X):
    XX=X.T.dot(X)  ## multiply X with it's transpose to get number docs in which both w1 (row) and w2 (column) occur
    term_freqs = np.asarray(X.sum(axis=0)) ## number of docs in which a word occurs
    #pmi=np.array(XX) * 1.0 / np.array(X.sum(axis=0)).T / np.array(X.sum(axis=0))
    pmi = XX.toarray() * 1.0  ## Casting to float, making it an array to use simple operations
    pmi /= term_freqs.T ## dividing by the number of documents in which w1 occurs
    pmi /= term_freqs  ## dividing by the number of documents in which w2 occurs
    
    return pmi  # this is not technically PMI beacuse we are ignoring some normalization factor and not taking the log 
                # but it's sufficient for ranking

In [87]:
pmi_matrix=getcollocations_matrix(X)
a.shape  # n_words by n_words

(483, 483)

In [88]:
a

array([[ 0.00399405,  0.00053261,  0.00085641, ...,  0.00061296,
         0.00066274,  0.00049234],
       [ 0.00053261,  0.01697531,  0.00082139, ...,  0.00045094,
         0.00042829,  0.00057458],
       [ 0.00085641,  0.00082139,  0.00670598, ...,  0.00069823,
         0.00045   ,  0.00055221],
       ..., 
       [ 0.00061296,  0.00045094,  0.00069823, ...,  0.00902344,
         0.00044339,  0.00087074],
       [ 0.00066274,  0.00042829,  0.00045   , ...,  0.00044339,
         0.00298861,  0.00054673],
       [ 0.00049234,  0.00057458,  0.00055221, ...,  0.00087074,
         0.00054673,  0.00278998]])

In [89]:
pmi_matrix[:,1].ravel().tolist()

[5.14668039114771e-05,
 0.0002227667631989307,
 8.991188635137565e-05,
 0.00026652452025586353,
 6.692992436918547e-05,
 0.00011940298507462687,
 2.6002392220084247e-05,
 3.0030931859815612e-05,
 0.00013568521031207597,
 0.0002261420171867933,
 0.00013819789939192924,
 0.00012756729174639623,
 2.5426530041445244e-05,
 0.00010974539069359087,
 5.7185337679418995e-05,
 1.3935922627757571e-05,
 3.503608716978488e-05,
 5.632216277105041e-05,
 0.00017768301350390902,
 0.00014490653528474132,
 0.000292654375182909,
 0.00024073182474723158,
 0.0002487562189054726,
 0.00029850746268656717,
 0.0002261420171867933,
 8.15594160345812e-05,
 0.00020169423154497784,
 2.8757944382135565e-05,
 0.0002227667631989307,
 0.0002227667631989307,
 0.0002227667631989307,
 0.00026184865147944484,
 3.2731081434930606e-05,
 0.0001243781094527363,
 2.438786459857575e-05,
 1.940880771694195e-05,
 0.00021949078138718174,
 0.00015076134479119556,
 0.00013948946854512484,
 8.577800651912849e-05,
 0.000204457166223676

In [90]:
"worse" in terms

False

In [91]:
def getcollocations(w):
    if w not in terms:
        return []
    idx = terms.index(w)
    col = a[:,idx].ravel().tolist()
    return sorted([(terms[i],val) for i,val in enumerate(col)],key=itemgetter(1),reverse=True)

In [106]:
## words that are close to "good", not enough info yet
getcollocations("good")

[(u'good', 0.0012990019157613248),
 (u'sean', 0.0009894664672151583),
 (u'nicely', 0.0009215728176087187),
 (u'forward', 0.0008879991787290832),
 (u'fairly', 0.0008726003490401396),
 (u'sad', 0.0008549720591605408),
 (u'pretty', 0.0008460801423536256),
 (u'stupid', 0.0008334223741852762),
 (u'technical', 0.0008266740148801322),
 (u'totally', 0.0008214479147860624),
 (u'shot', 0.000813992862910578),
 (u'sadly', 0.0008132974126976058),
 (u'average', 0.0008102717526801297),
 (u'intelligent', 0.0007956062005954214),
 (u'horrible', 0.0007921177925752724),
 (u'naturally', 0.0007839768760907504),
 (u'terrific', 0.0007831028773437151),
 (u'nice', 0.0007824948782153426),
 (u'therefore', 0.0007769729135288914),
 (u'thankfully', 0.0007742791829511099),
 (u'acting', 0.000772321679896414),
 (u'lovely', 0.0007690714940692756),
 (u'present', 0.0007649418644183042),
 (u'bad', 0.0007640757922921771),
 (u'climactic', 0.0007468867394326619),
 (u'really', 0.0007457428528542657),
 (u'suspenseful', 0.000744

In [107]:
## suming scores from a list of seed words for which we know the polarity
def seed_score(pos_seed):
    score=defaultdict(int)
    for seed in pos_seed:
        c=dict(getcollocations(seed))
        for w in c:
            score[w]+=c[w]
    return score

In [108]:
# words that are closest to the seed set (still many negatives in there, so we need some more work)
sorted(seed_score(['good','great','perfect','cool']).items(),key=itemgetter(1),reverse=True)

[(u'cool', 0.01836803051789725),
 (u'perfect', 0.014235784691719532),
 (u'generally', 0.004914620304679139),
 (u'great', 0.0044212228153536195),
 (u'shallow', 0.004387519939375499),
 (u'green', 0.004031276633069313),
 (u'quiet', 0.0038389323602368874),
 (u'sadly', 0.0037656812902670915),
 (u'cold', 0.00372639619058579),
 (u'eccentric', 0.003566246811727291),
 (u'anyway', 0.0035435598196591803),
 (u'mary', 0.003528786619802258),
 (u'like', 0.0034720852363373366),
 (u'willing', 0.0034463352861676677),
 (u'overall', 0.003416962030468515),
 (u'off', 0.0033758301235972932),
 (u'visually', 0.0033742110974529374),
 (u'therefore', 0.0033455317539115297),
 (u'close', 0.0033320331697695668),
 (u'sad', 0.0033247711223444326),
 (u'nicely', 0.00329676987067692),
 (u'entirely', 0.0032905492598513672),
 (u'intelligent', 0.0032582896033018925),
 (u'lovely', 0.0032553034585332107),
 (u'surely', 0.0032499006749901536),
 (u'totally', 0.003242437962550958),
 (u'minor', 0.0032310265819358213),
 (u'slowly',

In [109]:
posscores=seed_score(['good','great','perfect','cool'])
negscores=seed_score(['bad','terrible','wrong',"crap"])

## sentiment polarity score will be the difference between the words that are close to the positive seed
## and the words that are close to the negative seed
sentscores={}
for w in terms:
    sentscores[w]=posscores[w]-negscores[w]

    

In [104]:
sorted(sentscores.items(),key=itemgetter(1),reverse=False)

[(u'terrible', -0.009855717788299525),
 (u'wrong', -0.002892807170410995),
 (u'laughable', -0.0022372681494660608),
 (u'frankly', -0.0013849740220763036),
 (u'bad', -0.0013658125844714167),
 (u'poorly', -0.0013222754461456841),
 (u'anywhere', -0.0012546869557127096),
 (u'ugly', -0.001176343204772265),
 (u'current', -0.001117276129549125),
 (u'successfully', -0.001037488811300721),
 (u'unfunny', -0.0010133916696818575),
 (u'foreign', -0.000904334983761761),
 (u'sole', -0.0008546952219981477),
 (u'terribly', -0.0007390733994578876),
 (u'oddly', -0.0007073190373175831),
 (u'total', -0.0006894899408381215),
 (u'military', -0.0006728887380600891),
 (u'positive', -0.000609054423130711),
 (u'pathetic', -0.0005971148063656133),
 (u'awful', -0.000573324177411706),
 (u'earth', -0.0005105042292985202),
 (u'unnecessary', -0.0005082660511734676),
 (u'about', -0.0004718460547538108),
 (u'graphic', -0.00043745568496709455),
 (u'recently', -0.0004229737674965365),
 (u'critic', -0.00042093904961075274)

We got a reasonably good sentiment lexicon tailored to the specific data we are working without using any labels!  These lexicons are very similar to the ones we obtained in last lecture when we used the labels, only that this method can be applied to dataset where we do not have any sentiment annotations.