Task: Running a Chi-Square Test of Independence
I’m using the NESARC dataset to answer my research question:
Are dysthymia symptoms and gender associated among Latino young adults who suffered their first episode in adolescence?
Chi-Square Test of Independence
Now I will run a Chi-Square Test of Independence to examine the association between the age range of persons with dysthymia symptoms (categorical explanatory variable) and sex (categorical response variable).
H0: there is no relationship between the age range of persons with dysthymia symptoms and their sex.
Ha: there is a relationship between the age range of persons with dysthymia symptoms and their sex.
A chi-square test of independence revealed that there is no significant relationship between the age range of persons with dysthymia symptoms and their sex, X2 = 6.73, 3 df, p = 0.08. Therefore, according to these results, I fail to reject the null hypothesis.
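The df, or degrees of freedom, is (number of response levels - 1) x (number of explanatory levels - 1); with a two-level response variable this reduces to the number of levels of the explanatory variable minus 1. Here the explanatory variable (age range) has 4 levels, so df = 4 - 1 = 3.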
If I had rejected the null hypothesis, the Chi-Square test of independence would have revealed that there is a relationship between sex and age range.
In that case, it would be necessary to run post hoc comparisons between pairs of age ranges.
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 1 22:27:59 2015
"""
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt
def ChiSquaredTest(cat_responseVariable, cat_explanatoryVariable, data):
    """Run and print a chi-square test of independence between two columns.

    Prints the contingency table of observed counts, the column
    proportions (each explanatory-variable column sums to 1), and the
    result of ``scipy.stats.chi2_contingency`` on the observed counts.

    Parameters
    ----------
    cat_responseVariable : str
        Name of the categorical response column in ``data`` (table rows).
    cat_explanatoryVariable : str
        Name of the categorical explanatory column in ``data`` (table columns).
    data : pandas.DataFrame
        Frame containing both columns.

    Returns
    -------
    The object returned by ``scipy.stats.chi2_contingency``; its first four
    items are the chi-square statistic, the p value, the degrees of freedom
    and the array of expected counts.
    """
    # Contingency table of observed counts (rows: response, columns: explanatory).
    ct = pandas.crosstab(
        data[cat_responseVariable],
        data[cat_explanatoryVariable],
        dropna=True,
    )
    print(ct)

    # Column proportions: divide each column by its own total.
    colsum = ct.sum(axis=0)
    colpct = ct / colsum
    print(colpct)

    print('chi-square value, p value, expected counts')
    cs = scipy.stats.chi2_contingency(ct)
    print(cs)

    # Hand the result back so callers can use it programmatically instead of
    # re-reading it from the printed output.
    return cs
# Import the entire dataset to memory. low_memory=False makes pandas parse the
# file in one pass rather than in chunks.
data = pandas.read_csv("nesarc.csv", low_memory=False)

# Show every column and every row when a DataFrame is printed.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)

# Upper-case all DataFrame column names. A concrete list is assigned because
# a lazy ``map`` object is not accepted as an index by every pandas version
# under Python 3.
data.columns = [name.upper() for name in data.columns]

# Print floats in plain fixed-point notation.
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Ensure each of these columns is numeric; values that cannot be parsed
# become NaN. ``pandas.to_numeric`` replaces ``Series.convert_objects``,
# which has been removed from pandas.
numeric_columns = (
    "AGE",    # age
    "SEX",    # sex
    "S1Q1C",  # Hispanic or Latino origin
    "S4CQ5",  # age at onset of first episode
)
for column in numeric_columns:
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Data management for "S4CQ5": treat the codes 69 and 99 as missing.
# NOTE(review): presumably these are special/unknown codes in the NESARC
# codebook -- confirm before relying on them.
data["S4CQ5"] = data["S4CQ5"].replace([69, 99], numpy.nan)
# Research question: is there an association between the age at which someone
# has dysthymia symptoms and gender among Latino people?
# Refined question: are dysthymia symptoms and gender associated among Latino
# young adults who suffered their first episode in adolescence?

# Subset to Latino young adults aged 19 to 35 whose first episode occurred
# between 12 and 18 years of age. Comparisons against NaN are False, so rows
# with a missing age at onset are excluded without any explicit fill.
sub1 = data[
    (data['S1Q1C'] == 1)
    & (data['AGE'] >= 19) & (data['AGE'] <= 35)
    & (data['S4CQ5'] >= 12) & (data['S4CQ5'] <= 18)
]

# Work on an independent copy of the subset.
sub2 = sub1.copy()

# AGE categories by quartiles (qcut asked for 4 groups).
sub2['AGE_GROUP4'] = pandas.qcut(
    sub2.AGE, 4, labels=["1 = 25%", "2 = 50%", "3 = 75%", "4 = 100%"])
# AGE categories by ranges: 19-22, 23-26, 27-30, 31-35.
sub2['AGE_GROUP4_2'] = pandas.cut(sub2.AGE, [18, 22, 26, 30, 35])

# Keep only the analysis variables. The explicit copy makes sub3 an
# independent frame, so the column assignments below modify sub3 itself and
# do not trigger pandas' chained-assignment (SettingWithCopy) warning.
sub3 = sub2[['AGE', 'SEX', 'S1Q1C', 'S4CQ5', 'AGE_GROUP4', 'AGE_GROUP4_2']].copy()

# Recode AGE_GROUP4_2 in place from its range label to an integer code 1-4.
# The categories produced by pandas.cut are Interval objects in current
# pandas (plain strings in very old releases); converting to str first lets
# the string-keyed recode match in both cases.
recode2 = {'(18, 22]': 1, '(22, 26]': 2, '(26, 30]': 3, '(30, 35]': 4}
sub3["AGE_GROUP4_2"] = sub3["AGE_GROUP4_2"].astype(str).map(recode2)
# Overall chi-square test of independence: SEX (response) by age range
# (explanatory). Prints observed counts, column proportions and the test result.
ChiSquaredTest(cat_responseVariable='SEX', cat_explanatoryVariable='AGE_GROUP4_2', data=sub3)

# Treat the age-range code as categorical for plotting.
sub3["AGE_GROUP4_2"] = sub3["AGE_GROUP4_2"].astype('category')
# Numeric copy of SEX. ``pandas.to_numeric`` replaces ``Series.convert_objects``,
# which has been removed from pandas.
sub3['SEX1'] = pandas.to_numeric(sub3['SEX'], errors='coerce')

# Bar chart of SEX within each age-range group, without error bars.
# ``catplot`` is the current name of the former ``seaborn.factorplot``.
# NOTE(review): each bar is the mean of the numeric SEX code, not a
# proportion -- confirm this is the intended summary.
# NOTE(review): recent seaborn releases prefer ``errorbar=None`` over
# ``ci=None``; ``ci`` is kept here for compatibility with older releases.
seaborn.catplot(x="AGE_GROUP4_2", y="SEX", data=sub3, kind="bar", ci=None)
plt.xlabel('Age')
plt.ylabel('Sex')
# Post hoc chi-square tests: compare SEX between every pair of age-range groups.
#
# The pairs are built from sub3's AGE_GROUP4_2, which holds the integer codes
# 1-4. sub2's column of the same name still holds the raw range categories
# from pandas.cut, so mapping integer codes against it would match nothing
# and leave every comparison column entirely NaN.
#
# When reading the six p values, remember to adjust for multiple comparisons
# (e.g. a Bonferroni threshold of 0.05 / 6).
age_group_codes = sub3['AGE_GROUP4_2']
post_hoc_pairs = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
for first_group, second_group in post_hoc_pairs:
    comparison_column = 'COMP%dv%d' % (first_group, second_group)
    # Keep only the two groups being compared; every other group becomes NaN
    # and is therefore dropped from the contingency table.
    pair_recode = {first_group: first_group, second_group: second_group}
    sub3[comparison_column] = age_group_codes.map(pair_recode)
    ChiSquaredTest(cat_responseVariable='SEX',
                   cat_explanatoryVariable=comparison_column,
                   data=sub3)