Analysis of Quiz Results in Business Analytics

Libraries and data preparation

In [227]:
import mysql.connector as mys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [228]:
# connect to database
# this is the same MySQL database used for other things in the course
# the 'student' login has restricted access
# the quiz data is updated from time from time using a different program that reads from the Canvas API

cTBL = mys.connect(host='datalab2.bus.sfu.ca',
                    user='student',
                    password='SFU!',
                    database='CanvasTBL')
In [229]:
# get quiz results for a particular term
quizzes = pd.read_sql('''SELECT CourseName, QuizTitle, Percent, TimeSpent
    FROM vwQuizResults WHERE TermName='Summer 2020' ''' , cTBL)
In [230]:
# note that data points are anonymous--no student information
quizzes
Out[230]:
CourseName QuizTitle Percent TimeSpent
0 BUS553 G200 Business Analytics Quiz: Module 02 1.000000 3
1 BUS553 G200 Business Analytics Quiz: Module 02 0.400000 8
2 BUS553 G200 Business Analytics Quiz: Module 02 0.800000 4
3 BUS553 G200 Business Analytics Quiz: Module 02 0.800000 3
4 BUS553 G200 Business Analytics Quiz: Module 02 1.000000 7
... ... ... ... ...
142 BUS553 G100 Business Analytics Quiz: Module 05 0.833333 3
143 BUS553 G100 Business Analytics Quiz: Module 05 0.625000 5
144 BUS553 G100 Business Analytics Quiz: Module 05 0.777778 9
145 BUS553 G100 Business Analytics Quiz: Module 05 0.666667 10
146 BUS553 G100 Business Analytics Quiz: Module 05 0.888889 7

147 rows × 4 columns

Use GroupBy method to group by quiz

In [231]:
# group by quiz title ignoring course
quizagg = quizzes.groupby(["QuizTitle"])
In [232]:
# use a formatting dictionary to get the number formats looking reasonable for the describe() summary
format_dict = {"count": "{:.0f}", "mean": "{:.1%}", "std": "{:.1%}", "min": "{:.1%}", "25%": "{:.1%}", "50%": "{:.1%}", "75%": "{:.1%}", "max": "{:.1%}"}
quizagg["Percent"].describe().style.format(format_dict)
Out[232]:
count mean std min 25% 50% 75% max
QuizTitle
Quiz: Module 02 37 78.4% 20.8% 20.0% 60.0% 80.0% 100.0% 100.0%
Quiz: Module 03 37 73.2% 18.1% 40.0% 60.0% 80.0% 80.0% 100.0%
Quiz: Module 04 37 63.8% 15.0% 40.0% 60.0% 60.0% 70.0% 100.0%
Quiz: Module 05 36 73.8% 17.8% 16.7% 66.7% 77.8% 83.3% 100.0%

Show histograms of quiz results

In [233]:
# loop through each quiz name
# print mean and draw a histogram
i = 0
xticks = np.linspace(0,1,11)
xticklabels = pd.DataFrame(xticks).applymap(lambda x: "{:.0%}".format(x))
for group_key, group_value in quizagg:
    print("{}: mean score: {:.1%}".format(group_key, group_value['Percent'].mean()))
    i += 1
    plt.figure(figsize=(10,20))
    ax = plt.subplot(len(quizagg),1,i)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels[0].values)
    sns.distplot(group_value['Percent'], kde=False, hist=True)
    plt.show()
Quiz: Module 02: mean score: 78.4%
Quiz: Module 03: mean score: 73.2%
Quiz: Module 04: mean score: 63.8%
Quiz: Module 05: mean score: 73.8%

Consider time spent on each quiz

In [234]:
# get rough statistics on time spent
quizagg['TimeSpent'].describe().round(2)
Out[234]:
count mean std min 25% 50% 75% max
QuizTitle
Quiz: Module 02 37.0 6.08 2.45 2.0 4.0 6.0 8.0 10.0
Quiz: Module 03 37.0 6.97 2.32 1.0 6.0 7.0 9.0 10.0
Quiz: Module 04 37.0 7.03 2.35 2.0 6.0 7.0 9.0 10.0
Quiz: Module 05 36.0 6.00 2.68 2.0 3.0 6.0 8.0 10.0
In [235]:
# show the distribution of time spent on each quiz
i = 0
for group_key, group_value in quizagg:
    print("{}: mean time: {:.2f}".format(group_key, group_value['TimeSpent'].mean()))
    i += 1
    plt.figure(figsize=(10,20))
    ax = plt.subplot(len(quizagg),1,i)
    sns.distplot(group_value['TimeSpent'], kde=False, hist=True)
    plt.show()
Quiz: Module 02: mean time: 6.08
Quiz: Module 03: mean time: 6.97
Quiz: Module 04: mean time: 7.03
Quiz: Module 05: mean time: 6.00
In [ ]: