import pandas as pd
grades_df = pd.read_csv('grades.csv')
grades_df.head()
student_id | assignment1_grade | assignment1_submission | assignment2_grade | assignment2_submission | assignment3_grade | assignment3_submission | assignment4_grade | assignment4_submission | assignment5_grade | assignment5_submission | assignment6_grade | assignment6_submission | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B73F2C11-70F0-E37D-8B10-1D20AFED50B1 | 92.733946 | 2015-11-02 06:55:34.282000000 | 83.030552 | 2015-11-09 02:22:58.938000000 | 67.164441 | 2015-11-12 08:58:33.998000000 | 53.011553 | 2015-11-16 01:21:24.663000000 | 47.710398 | 2015-11-20 13:24:59.692000000 | 38.168318 | 2015-11-22 18:31:15.934000000 |
1 | 98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1 | 86.790821 | 2015-11-29 14:57:44.429000000 | 86.290821 | 2015-12-06 17:41:18.449000000 | 69.772657 | 2015-12-10 08:54:55.904000000 | 55.098125 | 2015-12-13 17:32:30.941000000 | 49.588313 | 2015-12-19 23:26:39.285000000 | 44.629482 | 2015-12-21 17:07:24.275000000 |
2 | D0F62040-CEB0-904C-F563-2F8620916C4E | 85.512541 | 2016-01-09 05:36:02.389000000 | 85.512541 | 2016-01-09 06:39:44.416000000 | 68.410033 | 2016-01-15 20:22:45.882000000 | 54.728026 | 2016-01-11 12:41:50.749000000 | 49.255224 | 2016-01-11 17:31:12.489000000 | 44.329701 | 2016-01-17 16:24:42.765000000 |
3 | FFDF2B2C-F514-EF7F-6538-A6A53518E9DC | 86.030665 | 2016-04-30 06:50:39.801000000 | 68.824532 | 2016-04-30 17:20:38.727000000 | 61.942079 | 2016-05-12 07:47:16.326000000 | 49.553663 | 2016-05-07 16:09:20.485000000 | 49.553663 | 2016-05-24 12:51:18.016000000 | 44.598297 | 2016-05-26 08:09:12.058000000 |
4 | 5ECBEEB6-F1CE-80AE-3164-E45E99473FB4 | 64.813800 | 2015-12-13 17:06:10.750000000 | 51.491040 | 2015-12-14 12:25:12.056000000 | 41.932832 | 2015-12-29 14:25:22.594000000 | 36.929549 | 2015-12-28 01:29:55.901000000 | 33.236594 | 2015-12-29 14:46:06.628000000 | 33.236594 | 2016-01-05 01:06:59.546000000 |
grades_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2315 entries, 0 to 2314
Data columns (total 13 columns):
student_id 2315 non-null object
assignment1_grade 2315 non-null float64
assignment1_submission 2315 non-null object
assignment2_grade 2315 non-null float64
assignment2_submission 2315 non-null object
assignment3_grade 2315 non-null float64
assignment3_submission 2315 non-null object
assignment4_grade 2315 non-null float64
assignment4_submission 2315 non-null object
assignment5_grade 2315 non-null float64
assignment5_submission 2315 non-null object
assignment6_grade 2315 non-null float64
assignment6_submission 2315 non-null object
dtypes: float64(6), object(7)
memory usage: 235.2+ KB
early_submission = grades_df[grades_df['assignment1_submission'] <= '2015-12-31']
late_submission = grades_df[grades_df['assignment1_submission'] > '2015-12-31']
early_submission.mean()
assignment1_grade 74.972741
assignment2_grade 67.252190
assignment3_grade 61.129050
assignment4_grade 54.157620
assignment5_grade 48.634643
assignment6_grade 43.838980
dtype: float64
late_submission.mean()
assignment1_grade 74.017429
assignment2_grade 66.370822
assignment3_grade 60.023244
assignment4_grade 54.058138
assignment5_grade 48.599402
assignment6_grade 43.844384
dtype: float64
from scipy import stats
# 使用t检验来比较两个总体是否有显著差异,即:早提交作业的学生与晚提交作业的学生成绩是否有显著差异
# 零假设:早提交的学生总体和晚提交的学生总体没有显著差异
# 备择假设:两个总体有显著差异
# 构造一个与此相关的统计量,如果该统计量非常的大(即已经超过了一定的临界值),即p-value>alpha
# 则可以认为这种差异并不仅仅是由抽样误差带来的,因此我们可以拒绝原假设,认为两个总体有显著差异。
stats.ttest_ind(early_submission['assignment1_grade'], late_submission['assignment1_grade'])
# pvalue=0.16,表示两个总体在assignment1上的成绩没有显著差异的概率是0.16>0.05,小概率事件没有发生,不能拒绝原假设
Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577)
stats.ttest_ind(early_submission['assignment2_grade'], late_submission['assignment2_grade'])
Ttest_indResult(statistic=1.3239868220912567, pvalue=0.18563824610067967)
stats.ttest_ind(early_submission['assignment6_grade'], late_submission['assignment6_grade'])
Ttest_indResult(statistic=-0.0097677547576531208, pvalue=0.99220742556985519)