tynbl.github.io

使用Python进行假设检验

import pandas as pd
grades_df = pd.read_csv('grades.csv')
grades_df.head()
student_id assignment1_grade assignment1_submission assignment2_grade assignment2_submission assignment3_grade assignment3_submission assignment4_grade assignment4_submission assignment5_grade assignment5_submission assignment6_grade assignment6_submission
0 B73F2C11-70F0-E37D-8B10-1D20AFED50B1 92.733946 2015-11-02 06:55:34.282000000 83.030552 2015-11-09 02:22:58.938000000 67.164441 2015-11-12 08:58:33.998000000 53.011553 2015-11-16 01:21:24.663000000 47.710398 2015-11-20 13:24:59.692000000 38.168318 2015-11-22 18:31:15.934000000
1 98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1 86.790821 2015-11-29 14:57:44.429000000 86.290821 2015-12-06 17:41:18.449000000 69.772657 2015-12-10 08:54:55.904000000 55.098125 2015-12-13 17:32:30.941000000 49.588313 2015-12-19 23:26:39.285000000 44.629482 2015-12-21 17:07:24.275000000
2 D0F62040-CEB0-904C-F563-2F8620916C4E 85.512541 2016-01-09 05:36:02.389000000 85.512541 2016-01-09 06:39:44.416000000 68.410033 2016-01-15 20:22:45.882000000 54.728026 2016-01-11 12:41:50.749000000 49.255224 2016-01-11 17:31:12.489000000 44.329701 2016-01-17 16:24:42.765000000
3 FFDF2B2C-F514-EF7F-6538-A6A53518E9DC 86.030665 2016-04-30 06:50:39.801000000 68.824532 2016-04-30 17:20:38.727000000 61.942079 2016-05-12 07:47:16.326000000 49.553663 2016-05-07 16:09:20.485000000 49.553663 2016-05-24 12:51:18.016000000 44.598297 2016-05-26 08:09:12.058000000
4 5ECBEEB6-F1CE-80AE-3164-E45E99473FB4 64.813800 2015-12-13 17:06:10.750000000 51.491040 2015-12-14 12:25:12.056000000 41.932832 2015-12-29 14:25:22.594000000 36.929549 2015-12-28 01:29:55.901000000 33.236594 2015-12-29 14:46:06.628000000 33.236594 2016-01-05 01:06:59.546000000
grades_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2315 entries, 0 to 2314
Data columns (total 13 columns):
student_id                2315 non-null object
assignment1_grade         2315 non-null float64
assignment1_submission    2315 non-null object
assignment2_grade         2315 non-null float64
assignment2_submission    2315 non-null object
assignment3_grade         2315 non-null float64
assignment3_submission    2315 non-null object
assignment4_grade         2315 non-null float64
assignment4_submission    2315 non-null object
assignment5_grade         2315 non-null float64
assignment5_submission    2315 non-null object
assignment6_grade         2315 non-null float64
assignment6_submission    2315 non-null object
dtypes: float64(6), object(7)
memory usage: 235.2+ KB
early_submission = grades_df[grades_df['assignment1_submission'] <= '2015-12-31']
late_submission = grades_df[grades_df['assignment1_submission'] > '2015-12-31']
early_submission.mean()
assignment1_grade    74.972741
assignment2_grade    67.252190
assignment3_grade    61.129050
assignment4_grade    54.157620
assignment5_grade    48.634643
assignment6_grade    43.838980
dtype: float64
late_submission.mean()
assignment1_grade    74.017429
assignment2_grade    66.370822
assignment3_grade    60.023244
assignment4_grade    54.058138
assignment5_grade    48.599402
assignment6_grade    43.844384
dtype: float64

scipy.stats.ttest_ind

from scipy import stats
# 使用t检验来比较两个总体是否有显著差异,即:早提交作业的学生与晚提交作业的学生成绩是否有显著差异
# 零假设:早提交的学生总体和晚提交的学生总体没有显著差异
# 备择假设:两个总体有显著差异
# 构造一个与此相关的统计量,如果该统计量非常的大(即已经超过了一定的临界值),即p-value>alpha
# 则可以认为这种差异并不仅仅是由抽样误差带来的,因此我们可以拒绝原假设,认为两个总体有显著差异。
stats.ttest_ind(early_submission['assignment1_grade'], late_submission['assignment1_grade'])
# pvalue=0.16,表示两个总体在assignment1上的成绩没有显著差异的概率是0.16>0.05,小概率事件没有发生,不能拒绝原假设
Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577)
stats.ttest_ind(early_submission['assignment2_grade'], late_submission['assignment2_grade'])
Ttest_indResult(statistic=1.3239868220912567, pvalue=0.18563824610067967)
stats.ttest_ind(early_submission['assignment6_grade'], late_submission['assignment6_grade'])
Ttest_indResult(statistic=-0.0097677547576531208, pvalue=0.99220742556985519)