from __future__ import print_function
import numpy as np
import json
import warnings
warnings.filterwarnings("ignore") #just for some sklearn stuff.
# Load the projects. The file stores one JSON array on its first line, so only
# that line is read instead of materializing every line of the file.
with open("kickstarter.jsonlist") as f:
    all_projects = json.loads(f.readline())
# Keep only projects whose description has more than 50 whitespace-separated tokens.
data = [x for x in all_projects if len(x['text'].split()) > 50]
print("{} projects loaded".format(len(data)))
np.random.shuffle(data) #to prove i'm not cheating...
print(data[0].keys())
# The year is the first four characters of 'start_date'. Sorting the distinct
# years gives the one-hot year columns a stable, reproducible order; raw set
# iteration order is arbitrary and can differ between runs.
ordered_years = sorted(set(x['start_date'][:4] for x in data))
year_to_index = {y: i for i, y in enumerate(ordered_years)}
# .items() works on both Python 2 and 3 (.iteritems() exists only on Python 2).
index_to_year = {i: y for y, i in year_to_index.items()}
print(ordered_years)
def control_features(p_in):
    """Build the non-language control features for a single project.

    Returns a ``(names, values)`` pair. ``names`` is a list of feature
    labels and ``values`` is a float32 array holding the matching values in
    the same order: five scalar features followed by a one-hot encoding of
    the project's start year, with one column per entry of the module-level
    ``ordered_years``.
    """
    # (label, value) pairs, listed in output order.
    scalar_feats = [
        # Was the project "featured" on kickstarter?
        ("was_featured", p_in["featured"]),
        # Did the creator have facebook connected to their kickstarter?
        ("had_facebook", p_in["creator_facebook_connect"]),
        # Did the creator have a video?
        ("had_video", p_in["has_video"]),
        # How much money were they trying to raise? (natural log)
        ("log_goal", np.log(p_in["goal"])),
        # How many rewards did they offer?
        ('rewards', len(p_in['rewards'])),
    ]
    names = [label for label, _ in scalar_feats]
    vals = [value for _, value in scalar_feats]

    # One indicator column per known year; only the project's own year is set.
    names += ['created_' + y for y in ordered_years]
    one_hot = np.zeros(len(ordered_years))
    project_year = p_in['start_date'][:4]
    one_hot[year_to_index[project_year]] = 1
    vals += list(one_hot)

    return names, np.array(vals, dtype=np.float32)
# Sanity check: show the control features extracted for the first project.
names, vals = control_features(data[0])
print(data[0]['name'])
for n, v in zip(names, vals):
    print("{}:{:.3f}".format(n, v))

# Targets, and the control-feature matrix with one row per project.
Y = np.array([p['result'] for p in data], dtype=np.float32)
control_rows = [control_features(p)[1] for p in data]
Xcontrol = np.vstack(control_rows)
print(Y.shape)
print(Xcontrol.shape)

# Constant-prediction baseline: always guess whichever class is more common.
n_positive = np.sum(Y == 1.)
bl_acc = n_positive * 1. / len(Y)
majority_acc = max(bl_acc, 1. - bl_acc)
print("Baseline 1: Constant Prediction -- {:.4f}".format(majority_acc))
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
def get_cv_acc(X_in, Y_in, cv = 10):
    """Estimate logistic-regression accuracy over repeated random splits.

    Despite the name this is not k-fold cross-validation: each of the ``cv``
    rounds draws an independent random 80/20 train/test split, seeded with
    the round number so the splits are reproducible, fits a fresh
    ``LogisticRegression`` with default settings on the training part, and
    scores it on the held-out part.

    Args:
        X_in: feature matrix, one row per sample.
        Y_in: labels aligned with the rows of ``X_in``.
        cv: number of random splits to average over; must be at least 1.

    Returns:
        ``(mean_accuracy, model)`` where ``model`` is the classifier fitted
        in the last round only (handy for inspecting coefficients).

    Raises:
        ValueError: if ``cv`` is smaller than 1. Without this check no split
            would be run and there would be neither an accuracy nor a model
            to return.
    """
    if cv < 1:
        raise ValueError("cv must be at least 1, got {!r}".format(cv))
    accs = []
    lr = None
    for split in range(cv):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X_in, Y_in, test_size=.2, random_state=split)
        #you should optimize hyperparameters, but that's another story for another day.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        accs.append(accuracy_score(y_test, lr.predict(X_test)))
    return np.mean(accs), lr
# Baseline 2: logistic regression on the raw control features.
acc, model_example = get_cv_acc(Xcontrol, Y)
print("Baseline 2: Control features -- {:.4f}".format(acc))
print()
# List the returned model's weight for every feature, intercept last.
coef_labels = names + ['bias']
coef_values = list(model_example.coef_[0, :])
coef_values.append(model_example.intercept_[0])
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))
# Let's see if normalizing feature-wise helps! This makes interpretability a bit trickier...
from sklearn.preprocessing import normalize
# Baseline 3: same model, with every feature column rescaled by normalize(axis=0).
Xcontrol_norm = normalize(Xcontrol, axis=0)
acc, model_example = get_cv_acc(Xcontrol_norm, Y)
print("Baseline 3: Normalized Control features -- {:.4f}".format(acc))
print()
# List the returned model's weight for every feature, intercept last.
coef_labels = names + ['bias']
coef_values = list(model_example.coef_[0, :])
coef_values.append(model_example.intercept_[0])
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))
# Well, okay, let's not do that then (for now).
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import string
# One document per project: lower-cased description plus title, with all
# ASCII punctuation characters removed.
exclude = set(string.punctuation)
docs = []
for p in data:
    raw_text = p['text'].lower() + " " + p['name'].lower()
    docs.append(''.join(ch for ch in raw_text if ch not in exclude))

# Bag-of-words counts over the cleaned documents.
cv = CountVectorizer(
    stop_words='english',
    max_df=.7,
    min_df=50,
    max_features=6000,
)
counts = cv.fit_transform(docs)
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #<idx>:`` header is
    printed, followed by one line with the ``n_top_words`` entries of
    ``feature_names`` that carry the largest weights in that row, largest
    first. A blank line is printed once all topics have been listed.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: flip it and keep the leading indices.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in best_first]
        print(" ".join(top_words))
    print()
# Fit an LDA topic model on the word counts and list each topic's top words.
n_topic = 15
print(counts.shape)
model = LDA(
    n_topics=n_topic,
    max_iter=10,
    n_jobs=4,
    verbose=1,
)
res = model.fit_transform(counts)
feature_names = cv.get_feature_names()
print_top_words(model, feature_names, 20)
print(res.shape)

# L1-normalize each row so a document's topic weights sum to 1, then preview
# the first three documents together with their row sums.
np.set_printoptions(precision=3)
res = normalize(res, axis=1, norm='l1')
first_rows = res[:3, :]
print(first_rows)
print(first_rows.sum(axis=1))

# Control features and topic proportions side by side, one row per project.
Xall = np.concatenate([Xcontrol, res], axis=1)
print(Xall.shape)
# Baseline 4: logistic regression on the topic proportions alone.
acc, model_example = get_cv_acc(res, Y)
print("Baseline 4: Language features -- {:.4f}".format(acc))
print()
# List the returned model's weight for every topic, intercept last.
coef_labels = ['topic-{}'.format(i) for i in range(n_topic)]
coef_labels.append('bias')
coef_values = list(model_example.coef_[0, :])
coef_values.append(model_example.intercept_[0])
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))
# Baseline 5: control features and topic proportions together.
acc, model_example = get_cv_acc(Xall, Y)
print("Baseline 5: All features -- {:.4f}".format(acc))
print()
# Labels follow the column order of Xall: control features, topics, then intercept.
topic_labels = ['topic-{}'.format(i) for i in range(n_topic)]
coef_labels = names + topic_labels + ['bias']
coef_values = list(model_example.coef_[0, :])
coef_values.append(model_example.intercept_[0])
for feat, coef in zip(coef_labels, coef_values):
    print("{}: {:.4f}".format(feat, coef))