@kohlmeier
Last active January 4, 2016 15:49
Revisions

  1. kohlmeier revised this gist Jan 29, 2014 (1 changed file with 1 addition and 1 deletion).
  2. kohlmeier created this gist Jan 27, 2014.

tasks_per_mission.py
"""This script is a hack to get a quick idea of how many mastery challenges
are being done under each mission. It's hacky for a lot of reasons... I
don't have time to list them all. But here's a few. :)

*) The user_mission (and thus the mission) associated with each LearningTask
   is the mission at the time of creation. For MasteryChallenges, this
   may not be a problem -- I'm not sure. But if this were extended to work
   on, say, PracticeTasks, a user could create the task in one mission, then
   switch missions and actually do the problems in another mission. This
   script would not understand that.

*) There may be a lot of double counting in here, because this script works
   off the data pipeline, which pulls data from the datastore that has
   been modified in the last hour. Thus, if someone works on a MasteryTask
   across an hour boundary on the wall clock, there are likely to be
   multiple copies of that entity stored on disk. They get merged later
   when loading into Hive, but that hasn't happened yet in the data
   this script operates on. Why am I not freaked out about this? Because
   I'm mostly interested in the *relative* count between missions, and
   I'm hoping that the double counting occurs the same amount in all missions.

USAGE:

    tasks_per_mission.py [YYYY-MM-DD]

It will output a single CSV line summarizing the counts for each day, e.g.:

    YYYY-MM-DD,num_allofmath_mts,num_othermission_mts

If no day is provided as a command line argument, it will process all days
for which data is present on the analytics machine.

It should be run from the ~/analytics/src directory on the analytics machine.
"""
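The script buckets each task by splitting the UserMission key's name on ':' and taking the last piece. A minimal sketch of that extraction, standing alone without the App Engine key object (the `<user_id>:<mission_slug>` key-name shape is an assumption here, inferred from the `split(':')[-1]` call below):

```python
def mission_from_key_name(key_name):
    """Sketch of how the script derives a mission bucket from a
    UserMission key name. The real code calls
    entity['user_mission'].id_or_name(); here we assume (hypothetically)
    the name looks like '<user_id>:<mission_slug>'.
    """
    if not key_name:
        # Mirrors the script's "NONE" bucket for tasks without a user_mission.
        return "NONE"
    return key_name.split(':')[-1]
```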

from collections import defaultdict
import glob
import json
import os
import pickle
import sys

import gae_util
gae_util.fix_sys_path()
from google.appengine.api import datastore
from google.appengine.datastore import entity_pb


ALL_OF_MATH_KEY = 'ag5zfmtoYW4tYWNhZGVteXIWCxIHTWlzc2lvbiIJYWxsb2ZtYXRoDA'
DATA_PATH = '/ebs/kadata2/daily_new/'
DATA_PATH_FULL = DATA_PATH + '%s/LearningTask/*.pickle.gz'

stats = defaultdict(int)

def process_file(filename, stats):
    """This function takes the name of a gzipped file of a pickled list of
    protobufs. Phew, that's a mouthful! It increments the counts of
    mastery tasks in the 'stats' argument appropriately.
    """
    print >> sys.stderr, filename
    os.system("gunzip --stdout %s > /ebs/modeling/jace/temp/lt.pickle"
              % filename)

    with open('/ebs/modeling/jace/temp/lt.pickle', 'rb') as pkl_file:
        pb_list = pickle.load(pkl_file)

    for pb in pb_list:
        entity = datastore.Entity._FromPb(entity_pb.EntityProto(pb))
        if 'MasteryTask' in entity['class']:
            if 'user_mission' in entity and entity['user_mission']:
                mission = entity['user_mission'].id_or_name().split(':')[-1]
            else:
                mission = "NONE"
            stats[mission] += 1
        else:
            pass  # print "non-mastery task"


if len(sys.argv) < 2:
    dirnames = glob.glob(DATA_PATH + '*')
    dirnames = sorted([os.path.basename(d) for d in dirnames])
else:
    dirnames = [sys.argv[1]]
print >> sys.stderr, dirnames

for day in dirnames:
    # NOTE: day is a directory name, but will be in the format YYYY-MM-DD

    stats = defaultdict(int)

    # Get all the file names for the day, and process them
    filenames = glob.glob(DATA_PATH_FULL % day)
    for filename in filenames:
        process_file(filename, stats)
    print >> sys.stderr, json.dumps(dict(stats), indent=4)

    all_of_math_count = stats[ALL_OF_MATH_KEY]
    other_mission_count = sum(stats.values()) - all_of_math_count

    # output the line to stdout
    print "%s, %d, %d" % (day, all_of_math_count, other_mission_count)
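The per-day summary step above (count tasks per mission, then split the total into the all-of-math bucket versus everything else) can be sketched without the datastore plumbing. `summarize` and the `'allofmath'` stand-in for `ALL_OF_MATH_KEY` are hypothetical names for illustration:

```python
from collections import defaultdict

def summarize(day, missions_seen, all_of_math='allofmath'):
    """Toy version of the per-day aggregation: 'missions_seen' is one
    mission bucket per mastery task; 'allofmath' stands in for the
    datastore key ALL_OF_MATH_KEY used by the real script.
    """
    stats = defaultdict(int)
    for mission in missions_seen:
        stats[mission] += 1
    all_of_math_count = stats[all_of_math]
    other_mission_count = sum(stats.values()) - all_of_math_count
    # Same output shape as the script's final print statement.
    return "%s, %d, %d" % (day, all_of_math_count, other_mission_count)
```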