from __future__ import print_function
import numpy as np
import json
# Load the Kickstarter dump. Only the first line of the file is parsed, so
# this assumes the whole dataset is stored on that line as one JSON document.
with open("kickstarter.jsonlist") as f:
    # readline() fetches just the first line instead of materialising every
    # line of the file in a list only to keep element 0.
    dataset = json.loads(f.readline())

# In-place shuffle; as a side effect the projects previewed first are random.
np.random.shuffle(dataset)  # just for fun :-)
# Quick look at what was loaded: overall size, then the type and the keys of
# the first record.
project_count = len(dataset)
print("There are {} projects in the dataset".format(project_count))

first_project = dataset[0]
print(type(first_project))
print(first_project.keys())
# Preview up to five projects: name, outcome and URL. Slicing (rather than
# indexing with range(5)) keeps this working when the dataset holds fewer
# than five projects.
for project in dataset[:5]:
    outcome = "Success" if project['result'] else "Failure"
    print("{}: {}".format(project['name'], outcome))
    # The explicit "\n" plus print's own newline leaves a blank line between
    # consecutive entries.
    print(project['url'] + "\n")
# Overall success rate printed as "successful/total"; a project counts as
# successful when its 'result' field is truthy.
n_successful = sum(1 for project in dataset if project['result'])
print("Success rate of Kickstarter projects: {}/{}".format(n_successful, len(dataset)))
# Let's make a histogram!
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram (100 bins) of the total backer count per project, where a
# project's total is the sum of 'num_backers' over its rewards.
plt.hist(
    [
        sum(reward['num_backers'] for reward in project['rewards'])
        for project in dataset
    ],
    bins=100,
)
plt.show()
# Same per-project backer totals, sorted ascending with the 1000 largest
# values dropped, then plotted as a 100-bin histogram with log=True.
n_backers = np.sort(
    np.array(
        [
            sum(reward['num_backers'] for reward in project['rewards'])
            for project in dataset
        ]
    )
)[:-1000]
plt.hist(n_backers, bins=100, log=True)
plt.show()
# Show the distinct category and sub-category labels present in the dataset.
categories = {project['category'] for project in dataset}
print(categories)

sub_categories = {project['sub_category'] for project in dataset}
print(sub_categories)
from collections import defaultdict
# Group projects by their 'category' value: category -> list of projects.
# defaultdict(list) creates the empty list the first time a category is seen.
cat_to_proj = defaultdict(list)
for project in dataset:
    bucket = cat_to_proj[project['category']]
    bucket.append(project)
# Per-category summary: project count and the percentage of projects whose
# 'result' equals 1.
# items() is used instead of iteritems() because it exists on both Python 2
# and Python 3, whereas iteritems() is Python 2 only.
for category, projects in cat_to_proj.items():
    n_success = sum(1 for project in projects if project['result'] == 1)
    success_pct = 100. * n_success / len(projects)
    print("{}: {} ({:.3f}%)".format(category, len(projects), success_pct))