#data source: Times Higher Education/Wall Street Journal via Kaggle: 
#https://www.kaggle.com/datasets/mylesoneill/world-university-rankings
#methodology


import pandas as pd


# Load the rankings table from the local data folder.
college = pd.read_csv('./data/timesData.csv')


# Last three rows, shown while the frame still has its default integer index.
college.tail(3)


# Use the university name as the row label from here on.
college = college.set_index('university_name')


college.head()


# How many rows does each country contribute?
college['country'].value_counts()

country
United States of America     659
United Kingdom               300
Germany                      152
Australia                    117
Canada                       108
                            ... 
Unted Kingdom                  1
Cyprus                         1
Unisted States of America      1
Luxembourg                     1
Lithuania                      1
Name: count, Length: 72, dtype: int64


# Find WM: flag every index label (university name) that contains "Mary".
wm = college.index.str.contains('Mary')
college.loc[wm]


# Select the William & Mary rows directly by index label.
college.loc['William & Mary']


# The same selection written as a query on the index name.
college.query('university_name=="William & Mary"')


#What types of variables do we have here?
#Does it make sense to find the average ranking for schools in a particular country?
#Are there larger issues if you look at the methodology?
#Suppose we had zip/postal codes... what type of variable would that be?


import numpy as np
import matplotlib.pyplot as plt


# Fake data, part 1: draws from NumPy's uniform [0, 1) generator.
n = 1_000_000  # number of data points

# Seed the global generator so every run reproduces the same sample.
np.random.seed(146)
x1 = np.random.random(n)


x1.mean()

0.5003746329139543


# Median of the x1 sample; compare it with np.mean(x1) above.
np.median(x1)

0.5008503639428961


# Fake data, part 2: a Beta(a, b) sample with both shape parameters set to 0.1.
a = b = 0.1
n = 1_000_000
# Re-seed so this sample is reproducible independently of the x1 draw.
np.random.seed(100)
x2 = np.random.beta(a, b, size=n)


x2.mean()

0.49955765208281727


# Median of the x2 sample; compare it with np.mean(x2) above.
np.median(x2)

0.49741088287963087


# Histogram of the x1 sample, drawn without the surrounding plot frame.
plt.hist(x1, rwidth=0.75, bins=12)  # rwidth: relative width of the bars as a fraction of the bin width
ax = plt.gca()  # get the current axes
# An axis spine is the line marking the data-area boundary; hide all of them.
# A plain loop is used because the calls are made only for their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.tick_params(labelsize=12, size=5.5)
plt.ylabel('Frequency', fontsize=14)
plt.xlabel('$x_1$', fontsize=14)
plt.show()


# Histogram of the x2 sample with the default number of bins.
plt.hist(x2, rwidth=0.95)
ax = plt.gca()
# Hide every spine; a plain loop is used because the calls are made only for
# their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.tick_params(labelsize=12, size=0)  # size=0 removes the tick marks themselves
plt.ylabel('Frequency', fontsize=14)
plt.xlabel('$x_2$', fontsize=14)
plt.show()


# Keep only the rows whose year is 2016.
college_2016 = college.loc[college['year'] == 2016]


college_2016.head()


# Summary of the num_students column for the 2016 rows.
college_2016['num_students'].describe()

count        793
unique       791
top       23,321
freq           2
Name: num_students, dtype: object


# Why did describe() report count/unique/top/freq rather than numeric statistics?
# Inspect the column dtypes to see the problem.
college_2016.dtypes

world_rank                 object
country                    object
teaching                  float64
international              object
research                  float64
citations                 float64
income                     object
total_score                object
num_students               object
student_staff_ratio       float64
international_students     object
female_male_ratio          object
year                        int64
dtype: object


# Fix the dtype in the full data set, then subset again.
# The values are stored as text with thousands separators (e.g. "23,321"),
# so strip the commas before casting to float.
students = college['num_students'].str.replace(',', '', regex=False).astype(float)


# Overwrite the text column with its numeric version.
college['num_students'] = students


college.head(3)


# Rebuild the 2016 subset so that it picks up the numeric column.
college_2016 = college.loc[college['year'] == 2016]


college_2016.dtypes

world_rank                 object
country                    object
teaching                  float64
international              object
research                  float64
citations                 float64
income                     object
total_score                object
num_students              float64
student_staff_ratio       float64
international_students     object
female_male_ratio          object
year                        int64
dtype: object


# Mean of num_students over the 2016 rows, now that the column is numeric.
college_2016['num_students'].mean()

24161.264817150062


# Median of num_students over the 2016 rows; compare it with the mean above.
college_2016['num_students'].median()

20174.0


# Boolean mask: True where num_students is missing.
college_2016['num_students'].isnull()

university_name
California Institute of Technology       False
University of Oxford                     False
Stanford University                      False
University of Cambridge                  False
Massachusetts Institute of Technology    False
                                         ...  
Yeungnam University                      False
Yıldız Technical University              False
Yokohama City University                 False
Yokohama National University             False
Yuan Ze University                       False
Name: num_students, Length: 800, dtype: bool


# Tally of present (False) versus missing (True) num_students entries.
college_2016['num_students'].isnull().value_counts()

num_students
False    793
True       7
Name: count, dtype: int64


# Numeric summary of the converted column.
college_2016['num_students'].describe()

count       793.000000
mean      24161.264817
std       22569.224842
min         462.000000
25%       12331.000000
50%       20174.000000
75%       29700.000000
max      379231.000000
Name: num_students, dtype: float64


# Entry count, non-null count, and dtype for the column.
college_2016['num_students'].info()

<class 'pandas.core.series.Series'>
Index: 800 entries, California Institute of Technology to Yuan Ze University
Series name: num_students
Non-Null Count  Dtype  
--------------  -----  
793 non-null    float64
dtypes: float64(1)
memory usage: 44.8+ KB


#What happens to missing values?
# Equal-width bin edges spanning the observed range of num_students.
# Adjust n_bins to change the bin size.
n_bins = 15
min_val = college_2016['num_students'].min()
max_val = college_2016['num_students'].max()
bins = np.linspace(min_val, max_val, n_bins + 1)


# Notice the plot is made differently here: through the Series.hist method,
# which returns the Axes it drew on.
ax = college_2016['num_students'].hist(rwidth=0.95, bins=bins)
# Hide every spine; a plain loop is used because the calls are made only for
# their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.ylabel('Frequency', fontsize=14)
plt.xlabel('Number of Students', fontsize=14)
# pandas' hist turns the grid on by default; switch it off explicitly instead
# of relying on grid(None), which toggles the current state.
plt.grid(False)
plt.show()


# Bin edges intended for a histogram of log10(num_students).
n_bins = 10
# NOTE(review): min_val is computed but never used below; the lower bin edge is
# hard-coded to 1 instead. Confirm which was intended.
min_val = np.log10(college_2016['num_students']).min()
max_val = np.log10(college_2016['num_students']).max()
bins = np.linspace(1, max_val, n_bins+1)
#notice we are making the plot differently here
#using a Series method
# NOTE(review): the plotting code below sits inside a string literal, so it is
# not executed; this cell only prints the bin edges.
"""
ax = np.log10(college_2016['num_students']).hist(rwidth = 0.95, bins = bins)
[ax.spines[i].set_visible(False) for i in ax.spines]
plt.ylabel('Frequency', fontsize = 14)
plt.xlabel('$log_{10}$'+' Number of Students', fontsize = 14)
plt.grid(None)
plt.show()
"""
print(bins)

[1.         1.45789038 1.91578077 2.37367115 2.83156153 3.28945192
 3.7473423  4.20523268 4.66312306 5.12101345 5.57890383]


# Variance of num_students computed through NumPy; the commented line is the
# pandas alternative, which is compared further down.
np.var(college_2016['num_students'])
#college_2016['num_students'].var()

508727577.1606413


# Per-column non-null counts and dtypes for the 2016 subset.
college_2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, California Institute of Technology to Yuan Ze University
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              800 non-null    object 
 1   country                 800 non-null    object 
 2   teaching                800 non-null    float64
 3   international           800 non-null    object 
 4   research                800 non-null    float64
 5   citations               800 non-null    float64
 6   income                  800 non-null    object 
 7   total_score             800 non-null    object 
 8   num_students            793 non-null    float64
 9   student_staff_ratio     793 non-null    float64
 10  international_students  790 non-null    object 
 11  female_male_ratio       739 non-null    object 
 12  year                    800 non-null    int64  
dtypes: float64(5), int64(1), object(7)
memory usage: 119.8+ KB


# Keep only the 2016 rows that have a num_students value.
# notnull() states the intent directly instead of comparing isnull() to False.
fil_college_2016 = college_2016[college_2016['num_students'].notnull()]


# Confirm that num_students now has no missing entries.
fil_college_2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 793 entries, California Institute of Technology to Yuan Ze University
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              793 non-null    object 
 1   country                 793 non-null    object 
 2   teaching                793 non-null    float64
 3   international           793 non-null    object 
 4   research                793 non-null    float64
 5   citations               793 non-null    float64
 6   income                  793 non-null    object 
 7   total_score             793 non-null    object 
 8   num_students            793 non-null    float64
 9   student_staff_ratio     793 non-null    float64
 10  international_students  790 non-null    object 
 11  female_male_ratio       739 non-null    object 
 12  year                    793 non-null    int64  
dtypes: float64(5), int64(1), object(7)
memory usage: 86.7+ KB


# NumPy variance of the unfiltered 2016 column (missing values still present).
np.var(college_2016['num_students'])

508727577.1606413


# NumPy variance after dropping the rows with missing num_students.
np.var(fil_college_2016['num_students'])

508727577.1606413


# pandas variance of the unfiltered column; compare it with the NumPy value above.
college_2016['num_students'].var()

509369909.9600865


# pandas variance of the filtered column.
fil_college_2016['num_students'].var()

509369909.9600865


# Rescale the pandas variance by (n - 1)/n, where n is the length of the
# filtered column.
fil_college_2016['num_students'].var()*(len(fil_college_2016['num_students'])-1)/len(fil_college_2016['num_students'])

508727577.1606413


# The same rescaling on the unfiltered column; here len() is the full row
# count, including rows where num_students is missing.
college_2016['num_students'].var()*(len(college_2016['num_students'])-1)/len(college_2016['num_students'])

508733197.5726364


# Referring to the two results above: why do they differ?


# Standard deviation of num_students computed through NumPy.
np.std(college_2016['num_students'])

22554.990072279823


# Ratio of the standard deviation to the mean (the coefficient of variation).
np.std(college_2016['num_students'])/np.mean(college_2016['num_students'])

0.9335185985904977


# Quartiles via NumPy, computed on the column's raw .values array.
q = np.quantile(college_2016['num_students'].values, [0.25, 0.5, 0.75])


print(q)

[nan nan nan]


# pandas' describe() on the same column, for comparison with the NumPy
# quantiles printed above.
college_2016['num_students'].describe()

count       793.000000
mean      24161.264817
std       22569.224842
min         462.000000
25%       12331.000000
50%       20174.000000
75%       29700.000000
max      379231.000000
Name: num_students, dtype: float64


#What is going on????
# Count the missing entries in the column that was handed to np.quantile above.
college_2016['num_students'].isnull().sum()

7


# 2016 num_students values with the missing entries removed.
students = college_2016['num_students'].dropna()


students

university_name
California Institute of Technology        2243.0
University of Oxford                     19919.0
Stanford University                      15596.0
University of Cambridge                  18812.0
Massachusetts Institute of Technology    11074.0
                                          ...   
Yeungnam University                      21958.0
Yıldız Technical University              31268.0
Yokohama City University                  4122.0
Yokohama National University             10117.0
Yuan Ze University                        8663.0
Name: num_students, Length: 793, dtype: float64


# Quartiles of the series with missing values removed.
q = np.quantile(students, [0.25, 0.5, 0.75])


print(q)
#What does the second number in this list correspond to?

[12331. 20174. 29700.]


# Quartiles plus the 0.93 quantile of the 2016 citations column.
q = np.quantile(college_2016['citations'].dropna(), [0.25, 0.5, 0.75, 0.93])

q

array([27.525, 50.3  , 74.9  , 91.507])


# Box plot of the 2016 citations column on a small square figure.
plt.figure(figsize=(4, 4))
plt.boxplot(college_2016['citations'].dropna())
ax = plt.gca()
# Hide every spine; a plain loop is used because the calls are made only for
# their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.xticks([])  # a single box needs no x tick
plt.yticks(fontsize=14)
plt.grid(linestyle='--', alpha=0.5)
plt.title('Citations in 2016', fontsize=14)
plt.show()


# Last five num_students values in the 2016 subset.
college_2016['num_students'].tail()

university_name
Yeungnam University             21958.0
Yıldız Technical University     31268.0
Yokohama City University         4122.0
Yokohama National University    10117.0
Yuan Ze University               8663.0
Name: num_students, dtype: float64


# Box plot of the 2016 num_students column, with missing values dropped first.
plt.figure(figsize=(4, 4))
plt.boxplot(college_2016['num_students'].dropna())
ax = plt.gca()
# Hide every spine; a plain loop is used because the calls are made only for
# their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.xticks([])  # a single box needs no x tick
plt.yticks(fontsize=14)
plt.grid(linestyle='--', alpha=0.5)
plt.title('Number of students in 2016', fontsize=14)
plt.show()


# Boolean mask for rows whose country is Canada, then the matching rows
# (all years).
idx_canada = college['country'].eq('Canada')
college_ca = college.loc[idx_canada]


import seaborn as sns
# One box per year showing the distribution of num_students in the Canada subset.
plt.figure(figsize=(12,6))
# NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
sns.boxplot(x='year', y='num_students', data = college_ca, palette = 'BuPu') #palette = 'BuPu'
plt.ylabel('Number of students', fontsize=14)
plt.xlabel('')
plt.show()

# https://medium.com/analytics-vidhya/deep-dive-into-seaborn-palettes-7b5fae5a258e


# Strip plot of num_students by year for the Canada subset: one
# semi-transparent, jittered marker per row.
plt.figure(figsize=[12, 6])
ax = plt.gca()
# Hide every spine; a plain loop is used because the calls are made only for
# their side effect.
for spine in ax.spines.values():
    spine.set_visible(False)
sns.stripplot(x='year', y='num_students', data=college_ca, size=10, alpha=0.5,
              jitter=True)
plt.ylabel('Number of Students', labelpad=20, fontsize=14)
plt.xlabel('Year', labelpad=20, fontsize=14)
plt.tick_params(labelsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

	world_rank	university_name	country	teaching	international	research	citations	income	total_score	num_students	student_staff_ratio	international_students	female_male_ratio	year
2600	601-800	Yokohama City University	Japan	24.0	16.1	10.2	36.4	37.9	-	4,122	3.7	3%	NaN	2016
2601	601-800	Yokohama National University	Japan	20.1	23.3	16.0	13.5	40.4	-	10,117	12.1	8%	28 : 72	2016
2602	601-800	Yuan Ze University	Taiwan	16.2	17.7	18.3	28.6	39.8	-	8,663	20.6	4%	43 : 57	2016

	world_rank	country	teaching	international	research	citations	income	total_score	num_students	student_staff_ratio	international_students	female_male_ratio	year
university_name
Harvard University	1	United States of America	99.7	72.4	98.7	98.8	34.5	96.1	20,152	8.9	25%	NaN	2011
California Institute of Technology	2	United States of America	97.7	54.6	98.0	99.9	83.7	96.0	2,243	6.9	27%	33 : 67	2011
Massachusetts Institute of Technology	3	United States of America	97.8	82.3	91.4	99.9	87.5	95.6	11,074	9.0	33%	37 : 63	2011
Stanford University	4	United States of America	98.3	29.5	98.1	99.2	64.3	94.3	15,596	7.8	22%	42 : 58	2011
Princeton University	5	United States of America	90.9	70.3	95.4	99.9	-	94.2	7,929	8.4	27%	45 : 55	2011

	world_rank	country	teaching	international	research	citations	income	total_score	num_students	student_staff_ratio	international_students	female_male_ratio	year
university_name
William & Mary	75	United States of America	53.1	20.9	36.1	95.6	-	60.4	7,867	11.8	7%	54 : 46	2011
University of Maryland, College Park	98	United States of America	45.4	35.4	48.6	79.2	-	57.2	31,331	8.4	9%	48 : 52	2011
Queen Mary University of London	120	United Kingdom	39.7	91.0	44.1	73.5	38.9	54.6	14,260	14.0	40%	52 : 48	2011
University of Maryland, College Park	94	United States of America	41.1	35.3	43.6	85.8	28.5	54.5	31,331	8.4	9%	48 : 52	2012
Queen Mary University of London	127	United Kingdom	29.0	88.8	28.6	83.7	36.6	49.9	14,260	14.0	40%	52 : 48	2012
William & Mary	146	United States of America	40.0	19.7	18.9	90.7	27.0	47.0	7,867	11.8	7%	54 : 46	2012
University of Maryland, Baltimore County	301-350	United States of America	17.6	18.6	14.5	52.8	29.9	-	13,908	18.1	7%	46 : 54	2012
University of Maryland, College Park	97	United States of America	44.9	40.4	51.4	83.9	32.1	57.9	31,331	8.4	9%	48 : 52	2013
Queen Mary University of London	145	United Kingdom	33.1	87.0	30.0	85.4	42.7	52.1	14,260	14.0	40%	52 : 48	2013
William & Mary	184	United States of America	41.3	25.8	22.6	87.1	29.9	48.0	7,867	11.8	7%	54 : 46	2013
University of Maryland, Baltimore County	301-350	United States of America	19.4	32.6	16.4	65.2	32.6	-	13,908	18.1	7%	46 : 54	2013
University of Maryland, College Park	108	United States of America	39.0	42.4	37.2	84.4	32.1	52.2	31,331	8.4	9%	48 : 52	2014
Queen Mary University of London	114	United Kingdom	31.0	88.0	29.2	87.0	37.7	51.7	14,260	14.0	40%	52 : 48	2014
William & Mary	201-225	United States of America	37.2	26.0	18.5	73.6	31.9	-	7,867	11.8	7%	54 : 46	2014
University of Maryland, Baltimore County	351-400	United States of America	16.9	34.4	15.0	65.9	32.6	-	13,908	18.1	7%	46 : 54	2014
Queen Mary University of London	107	United Kingdom	32.4	88.6	32.9	88.9	37.1	53.8	14,260	14.0	40%	52 : 48	2015
University of Maryland, College Park	132	United States of America	36.5	44.8	39.1	83.6	33.2	51.9	31,331	8.4	9%	48 : 52	2015
William & Mary	201-225	United States of America	36.8	26.5	19.9	78.7	30.0	-	7,867	11.8	7%	54 : 46	2015
Queen Mary University of London	98	United Kingdom	34.1	93.5	41.3	93.3	36.8	58.5	14,260	14.0	40%	52 : 48	2016
University of Maryland, College Park	117	United States of America	45.0	43.5	42.1	88.2	32.3	56.7	31,331	8.4	9%	48 : 52	2016
William & Mary	201-250	United States of America	38.5	26.6	17.5	85.0	29.1	-	7,867	11.8	7%	54 : 46	2016
University of Maryland, Baltimore County	401-500	United States of America	21.3	28.2	18.2	61.3	31.7	-	13,908	18.1	7%	46 : 54	2016

	world_rank	country	teaching	international	research	citations	income	total_score	num_students	student_staff_ratio	international_students	female_male_ratio	year
university_name
William & Mary	75	United States of America	53.1	20.9	36.1	95.6	-	60.4	7,867	11.8	7%	54 : 46	2011
William & Mary	146	United States of America	40.0	19.7	18.9	90.7	27.0	47.0	7,867	11.8	7%	54 : 46	2012
William & Mary	184	United States of America	41.3	25.8	22.6	87.1	29.9	48.0	7,867	11.8	7%	54 : 46	2013
William & Mary	201-225	United States of America	37.2	26.0	18.5	73.6	31.9	-	7,867	11.8	7%	54 : 46	2014
William & Mary	201-225	United States of America	36.8	26.5	19.9	78.7	30.0	-	7,867	11.8	7%	54 : 46	2015
William & Mary	201-250	United States of America	38.5	26.6	17.5	85.0	29.1	-	7,867	11.8	7%	54 : 46	2016

	world_rank	country	teaching	international	research	citations	income	total_score	num_students	student_staff_ratio	international_students	female_male_ratio	year
university_name
William & Mary	75	United States of America	53.1	20.9	36.1	95.6	-	60.4	7,867	11.8	7%	54 : 46	2011
William & Mary	146	United States of America	40.0	19.7	18.9	90.7	27.0	47.0	7,867	11.8	7%	54 : 46	2012
William & Mary	184	United States of America	41.3	25.8	22.6	87.1	29.9	48.0	7,867	11.8	7%	54 : 46	2013
William & Mary	201-225	United States of America	37.2	26.0	18.5	73.6	31.9	-	7,867	11.8	7%	54 : 46	2014
William & Mary	201-225	United States of America	36.8	26.5	19.9	78.7	30.0	-	7,867	11.8	7%	54 : 46	2015
William & Mary	201-250	United States of America	38.5	26.6	17.5	85.0	29.1	-	7,867	11.8	7%	54 : 46	2016

DATA 201

Describing Univariate Data with Statistics and Plots

Summary statistics

Measures of central tendency and histograms

Measures of variation (spread)

Quartiles and Box Plots

Looking at distributions over a category