from __future__ import print_function
import numpy as np
import json
import warnings
warnings.filterwarnings("ignore") #just for some sklearn stuff.

# Load the projects. Only the first line of the file is parsed as JSON, so
# read just that line instead of materialising every line with readlines().
# Projects whose text has 50 or fewer whitespace-separated tokens are dropped.
with open("kickstarter.jsonlist") as f:
    data = [x for x in json.loads(f.readline()) if len(x['text'].split()) > 50]
print("{} projects loaded".format(len(data)))
np.random.shuffle(data) #to prove i'm not cheating...
print(data[0].keys())

45129 projects loaded
[u'raised', u'sub_category', u'text', u'creator_num_backed', u'featured', u'result', u'duration', u'category', u'goal', u'creator_facebook_connect', u'projectId', u'lon', u'has_video', u'comments', u'faqs', u'start_date', u'rewards', u'end_date', u'parent_category', u'updates', u'lat', u'short_text', u'name', u'url', u'backers']

Here's a trick -- you can encode categorical variables as indicator features! We'll use this strategy for the project's starting year.

# Vocabulary for the start-year indicator features. The year is the first
# four characters of each project's `start_date`. Years are sorted so the
# indicator columns come out in chronological order on every run instead of
# depending on set iteration order.
ordered_years = sorted({x['start_date'][:4] for x in data})
year_to_index = {y: i for i, y in enumerate(ordered_years)}
# items() exists on both Python 2 and Python 3; iteritems() is Python 2 only.
index_to_year = {i: y for y, i in year_to_index.items()}

print(ordered_years)

[u'2009', u'2011', u'2010', u'2012']

def control_features(p_in):
    '''Builds the non-language baseline feature vector for one project.

    Reads the module-level `ordered_years` and `year_to_index` to lay out
    the start-year indicator columns.

    p_in -- project dict; the keys read are "featured",
            "creator_facebook_connect", "has_video", "goal", "rewards"
            and "start_date".

    Returns (names, vals): the list of feature names and a float32 numpy
    array of the matching values, in the same order.
    '''
    # Scalar controls, in output order: featured flag, facebook flag,
    # video flag, natural log of the funding goal, number of rewards.
    scalar_features = [
        ("was_featured", p_in["featured"]),
        ("had_facebook", p_in["creator_facebook_connect"]),
        ("had_video", p_in["has_video"]),
        ("log_goal", np.log(p_in["goal"])),
        ('rewards', len(p_in['rewards'])),
    ]
    names = [label for label, _ in scalar_features]
    vals = [value for _, value in scalar_features]

    # Start year as an indicator vector: one slot per known year, with a 1
    # in the slot of this project's year (first four chars of start_date).
    names += ['created_' + y for y in ordered_years]
    start_year = p_in['start_date'][:4]
    one_hot = np.zeros(len(ordered_years))
    one_hot[year_to_index[start_year]] = 1
    vals += list(one_hot)

    return names, np.array(vals, dtype = np.float32)

# Print the control features of the first project as a sanity check.
example_project = data[0]
names, vals = control_features(example_project)
print(example_project['name'])
for feat_name, feat_val in zip(names, vals):
    print("{}:{:.3f}".format(feat_name, feat_val))

'Cockney: Regeneration'. London's last days (before the Olympics). A documentary
was_featured:0.000
had_facebook:0.000
had_video:0.000
log_goal:8.517
rewards:6.000
created_2009:0.000
created_2011:0.000
created_2010:1.000
created_2012:0.000

# Labels (the 'result' field) and the control-feature matrix, one row per project.
Y = np.array([p['result'] for p in data], dtype=np.float32)
Xcontrol = np.vstack([control_features(p)[1] for p in data])
print(Y.shape)
print(Xcontrol.shape)

# Constant-prediction baseline: always guess whichever label is more common.
bl_acc = np.mean(Y == 1.)
majority_acc = np.max([bl_acc, 1. - bl_acc])
print("Baseline 1: Constant Prediction -- {:.4f}".format(majority_acc))

(45129,)
(45129, 9)
Baseline 1: Constant Prediction -- 0.5187

from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

def get_cv_acc(X_in, Y_in, cv = 10):
    '''Estimates logistic-regression accuracy over repeated random splits.

    This is repeated random holdout rather than k-fold cross-validation:
    for each split index the data is divided 80/20 with that index as the
    random seed, a default LogisticRegression is fit on the 80% and scored
    on the 20%.

    X_in -- feature matrix, one row per example.
    Y_in -- labels, aligned with the rows of X_in.
    cv   -- number of random splits to average over; must be at least 1.

    Returns (mean_accuracy, model), where model is the classifier fit on
    the last split only (handy for inspecting example coefficients).

    Raises ValueError if cv is less than 1.
    '''
    # Without at least one split there is no fitted model to return.
    if cv < 1:
        raise ValueError("cv must be at least 1, got {}".format(cv))
    accs = []
    for split in range(cv):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_in,
                                                                             Y_in,
                                                                             test_size=.2,
                                                                             random_state=split)
        #you should optimize hyperparameters, but that's another story for another day.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        accs.append(accuracy_score(y_test, lr.predict(X_test)))
    return np.mean(accs), lr

# Logistic regression on the control features only, then list the
# coefficients of the example model with the intercept as 'bias'.
acc, model_example = get_cv_acc(Xcontrol, Y)
print("Baseline 2: Control features -- {:.4f}".format(acc))
print()
coef_labels = names + ['bias']
coef_values = list(model_example.coef_[0, :]) + [model_example.intercept_[0]]
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))

Baseline 2: Control features -- 0.6655

was_featured: 2.5285
had_facebook: 0.0256
had_video: 0.8398
log_goal: -0.6350
rewards: 0.1360
created_2009: 0.5919
created_2011: 0.7834
created_2010: 0.7283
created_2012: 0.6719
bias: 2.7755

# Let's see if normalizing feature-wise helps! This makes interpretability a bit trickier...
# NOTE(review): with axis=0, normalize() rescales each feature column to unit
# norm across all projects; it does not standardize to zero mean and unit
# variance -- confirm this is the scaling that was intended.
from sklearn.preprocessing import normalize
Xcontrol_norm = normalize(Xcontrol, axis=0)
acc, model_example = get_cv_acc(Xcontrol_norm, Y)
print("Baseline 3: Normalized Control features -- {:.4f}".format(acc))
print()
coef_labels = names + ['bias']
coef_values = list(model_example.coef_[0, :]) + [model_example.intercept_[0]]
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))

Baseline 3: Normalized Control features -- 0.5188

was_featured: 9.9851
had_facebook: 0.0632
had_video: 5.0117
log_goal: -3.1953
rewards: 5.7406
created_2009: 0.0227
created_2011: 1.8019
created_2010: -0.3952
created_2012: -1.8250
bias: 0.0308

Well, okay, let's not do that then (for now).

Let's add some topic model features!

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import string
# One document per project: lower-cased text and name joined by a space,
# with every ASCII punctuation character removed.
exclude = set(string.punctuation)
docs = []
for p in data:
    raw_doc = p['text'].lower() + " " + p['name'].lower()
    docs.append(''.join(ch for ch in raw_doc if ch not in exclude))

# Bag-of-words counts over the cleaned documents.
cv = CountVectorizer(max_features=6000,
                     min_df=50,
                     max_df=.7,
                     stop_words='english')
counts = cv.fit_transform(docs)
def print_top_words(model, feature_names, n_top_words):
    '''Prints the highest-weighted words of every topic in a model.

    model         -- object whose `components_` yields one weight array
                     per topic.
    feature_names -- sequence of words, indexed by position in each
                     topic's weight array.
    n_top_words   -- how many words to print per topic.

    For each topic, prints a "Topic #k:" header followed by the words in
    descending weight order on one space-separated line. A single blank
    line is printed after the last topic.
    '''
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # Reverse slice of the ascending argsort: the last n_top_words
        # indices, heaviest first.
        heaviest_first = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[word_idx] for word_idx in heaviest_first]
        print(" ".join(top_words))
    print()

# Number of LDA topics; reused later when labelling the topic coefficients.
n_topic = 15
print(counts.shape)
# NOTE(review): `n_topics` is the keyword used here; later scikit-learn
# releases are understood to name it `n_components` -- confirm against the
# installed version before upgrading.
model = LDA(n_topics=n_topic, max_iter=10, n_jobs=4, verbose=1)
# Fit on the count matrix and keep the transformed output (one row per
# document, one column per topic) in `res`.
res = model.fit_transform(counts)

(45129, 6000)

[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   10.9s finished

# Vocabulary terms from the fitted vectorizer; print_top_words indexes this
# list by position within each topic's weight array.
feature_names = cv.get_feature_names()
# Show the 20 highest-weighted words of each topic.
print_top_words(model, feature_names, 20)

Topic #0:
goal help project kickstarter make pledge money thank support rewards need new raise want time reach dont receive like friends
Topic #1:
design use product iphone app production designs project designed kickstarter case ipad using new available products video like make software
Topic #2:
like make just want im people time need going know really money help way good little dont ive things love
Topic #3:
documentary project people travel american history stories women world video interviews series america trip country culture years time states journey
Topic #4:
tour new city party york help 2011 road house music summer event shows 2012 fans austin money year rock band
Topic #5:
film production short crew films movie feature cast equipment festival director project festivals shoot money budget sound producer shooting make
Topic #6:
life world project help people story time love make hope way support thank like goal want need believe lives family
Topic #7:
game games play cards players new card player level character world characters team time fun set zombie playing development kickstarter
Topic #8:
school students children community people world social kids program public project education schools earth human work learn research youth change
Topic #9:
light black water project white space camera man piece power 3d burning time set used large wood metal ship lights
Topic #10:
art food project local community work create space artists materials gallery help new artist fashion collection exhibition painting business small
Topic #11:
book print books project art printing comic work novel cover edition magazine writing publishing issue printed copies series pages published
Topic #12:
story man life years family young old death girl new love father night john day time dark written home world
Topic #13:
new artists dance festival arts work theatre performance production theater play art york company stage san music musical 2011 support
Topic #14:
music album record songs recording cd band studio new song video musicians release ep project recorded sound time mastering years

These are the per-document topic distributions.

print(res.shape)
np.set_printoptions(precision=3)
# L1-normalize every row so each document's topic weights sum to 1.
res = normalize(res, norm='l1', axis=1)
first_rows = res[:3, :]
print(first_rows)
# The row sums should print as 1.
print(np.sum(first_rows, axis=1))

(45129, 15)
[[  6.601e-04   6.601e-04   6.601e-04   3.437e-01   6.601e-04   2.936e-02
    1.239e-01   1.662e-02   1.692e-01   6.601e-04   7.678e-02   6.601e-04
    6.601e-04   2.353e-01   6.601e-04]
 [  8.027e-02   3.745e-04   4.275e-02   4.248e-02   5.568e-02   3.745e-04
    7.478e-02   3.745e-04   3.745e-04   5.252e-02   3.745e-04   3.745e-04
    3.745e-04   6.485e-01   3.745e-04]
 [  4.357e-04   4.357e-04   3.303e-01   1.232e-02   4.357e-04   4.357e-04
    9.546e-02   4.969e-02   1.892e-01   4.357e-04   4.357e-04   2.936e-01
    2.592e-02   4.357e-04   4.357e-04]]
[ 1.  1.  1.]

# Put the control features and the topic features side by side, column-wise.
Xall = np.concatenate([Xcontrol, res], axis=1)
print(Xall.shape)

(45129, 24)

# Logistic regression on the topic features alone, then list one
# coefficient per topic with the intercept as 'bias'.
acc, model_example = get_cv_acc(res, Y)
print("Baseline 4: Language features -- {:.4f}".format(acc))
print()
topic_labels = ['topic-{}'.format(i) for i in range(n_topic)]
coef_labels = topic_labels + ['bias']
coef_values = list(model_example.coef_[0, :]) + [model_example.intercept_[0]]
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))

Baseline 4: Language features -- 0.6161

topic-0: 2.1993
topic-1: -2.0803
topic-2: 0.1134
topic-3: -0.6508
topic-4: 0.3789
topic-5: 0.0496
topic-6: -0.9671
topic-7: -0.7818
topic-8: -0.6217
topic-9: 0.6491
topic-10: -0.2392
topic-11: -0.5681
topic-12: -0.2334
topic-13: 2.1574
topic-14: 0.5291
bias: -0.0657

# Logistic regression on control and topic features together. The labels
# follow the column order of Xall: control features first, then topics.
acc, model_example = get_cv_acc(Xall, Y)
print("Baseline 5: All features -- {:.4f}".format(acc))
print()
topic_labels = ['topic-{}'.format(i) for i in range(n_topic)]
coef_labels = names + topic_labels + ['bias']
coef_values = list(model_example.coef_[0, :]) + [model_example.intercept_[0]]
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))

Baseline 5: All features -- 0.6881

was_featured: 2.6396
had_facebook: 0.0627
had_video: 0.8117
log_goal: -0.5913
rewards: 0.1268
created_2009: 0.4540
created_2011: 0.6764
created_2010: 0.5934
created_2012: 0.6037
topic-0: 1.7321
topic-1: -1.1163
topic-2: -0.1198
topic-3: -0.0102
topic-4: -0.0161
topic-5: 0.2327
topic-6: -0.4915
topic-7: -0.7442
topic-8: 0.2606
topic-9: 0.6220
topic-10: 0.0240
topic-11: -1.0359
topic-12: 0.2107
topic-13: 2.2725
topic-14: 0.5070
bias: 2.3275