# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from warnings import filterwarnings

filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline


# Dict of all the needed columns
# Maps the raw PISA 2012 column codes (keys) to the readable names (values) used in the
# rest of this notebook. The keys double as the `usecols` filter when the CSV is loaded,
# and the whole dict is later handed to `DataFrame.rename`.
#
# NOTE(review): some friendly names look looser than the codebook meaning — verify
# against the PISA 2012 codebook before interpreting results:
#   * ST05Q01 appears to be attendance of <ISCED 0> (pre-primary), not primary school.
#   * BFMJ2 / BMMJ1 appear to be ISEI occupational-status indices, not earnings.
colnames = {
    "STIDSTD": "stud_id",
    "NC": "country",
    "ST04Q01": "gender",
    "ST05Q01": "pri_sch",
    "ST06Q01": "age_at_pri_sch",
    "ST01Q01": "intl_grade",
    "ST08Q01": "late_for_school",
    "ST09Q01": "skip_day",
    "ST115Q01": "skip_class",
    "ST26Q02": "possess_room",
    "ST26Q03": "possess_study_place",
    "ST26Q04": "possess_computer",
    "ST26Q06": "possess_internet",
    "ST26Q10": "possess_textbook",
    "ST29Q06": "math_interest",
    "ST42Q01": "math_anxiety",
    "MATBEH": "math_behaviour",
    "PV1MATH": "math_score",
    "PV1READ": "read_score",
    "PV1SCIE": "science_score",
    "ST44Q03": "failure_attr",
    "ST13Q01": "mother_sch_lvl",
    "ST17Q01": "father_sch_lvl",
    "ST15Q01": "mother_job_status",
    "ST19Q01": "father_job_status",
    "TCHBEHFA": "teacher_behaviour",
    "TEACHSUP": "teacher_support",
    "BFMJ2": "father_earning_pct",
    "BMMJ1": "mother_earning_pct"
}


# Loading the pisa dataset
# 'ANSI' is a codec alias on Windows only (it maps to the locale code page), so the load
# fails with LookupError on other platforms; cp1252 is the Western-European code page
# that alias normally resolves to. nrows is passed as a real int and usecols as a list,
# which is what read_csv documents for those parameters.
pisa_df = pd.read_csv(
    'pisa2012/pisa2012.csv',
    encoding='cp1252',
    nrows=200_000,
    usecols=list(colnames),
    low_memory=False,
)


pisa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 29 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   NC        200000 non-null  object 
 1   STIDSTD   200000 non-null  int64  
 2   ST01Q01   200000 non-null  int64  
 3   ST04Q01   200000 non-null  object 
 4   ST05Q01   194537 non-null  object 
 5   ST06Q01   184727 non-null  float64
 6   ST08Q01   196367 non-null  object 
 7   ST09Q01   196385 non-null  object 
 8   ST115Q01  196481 non-null  float64
 9   ST13Q01   186541 non-null  object 
 10  ST15Q01   191405 non-null  object 
 11  ST17Q01   179712 non-null  object 
 12  ST19Q01   184470 non-null  object 
 13  ST26Q02   194627 non-null  object 
 14  ST26Q03   193172 non-null  object 
 15  ST26Q04   194277 non-null  object 
 16  ST26Q06   194110 non-null  object 
 17  ST26Q10   192970 non-null  object 
 18  ST29Q06   128594 non-null  object 
 19  ST42Q01   127967 non-null  object 
 20  ST44Q03   127963 non-null  object 
 21  BFMJ2     169195 non-null  float64
 22  BMMJ1     157717 non-null  float64
 23  MATBEH    128231 non-null  float64
 24  TCHBEHFA  128395 non-null  float64
 25  TEACHSUP  129264 non-null  float64
 26  PV1MATH   200000 non-null  float64
 27  PV1READ   200000 non-null  float64
 28  PV1SCIE   200000 non-null  float64
dtypes: float64(10), int64(2), object(17)
memory usage: 44.3+ MB


# Top 5 rows of the dataset
pisa_df.head()


pisa_df.shape

(200000, 29)


# Renaming all columns to their right name
pisa_df.rename(columns=colnames, inplace=True)


# Top 5 rows of the sliced columns
pisa_df.loc[:4, :'father_job_status']


# Top 5 rows of the sliced columns
pisa_df.loc[:4, 'possess_room':]


pisa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   country              200000 non-null  object 
 1   stud_id              200000 non-null  int64  
 2   intl_grade           200000 non-null  int64  
 3   gender               200000 non-null  object 
 4   pri_sch              194537 non-null  object 
 5   age_at_pri_sch       184727 non-null  float64
 6   late_for_school      196367 non-null  object 
 7   skip_day             196385 non-null  object 
 8   skip_class           196481 non-null  float64
 9   mother_sch_lvl       186541 non-null  object 
 10  mother_job_status    191405 non-null  object 
 11  father_sch_lvl       179712 non-null  object 
 12  father_job_status    184470 non-null  object 
 13  possess_room         194627 non-null  object 
 14  possess_study_place  193172 non-null  object 
 15  possess_computer     194277 non-null  object 
 16  possess_internet     194110 non-null  object 
 17  possess_textbook     192970 non-null  object 
 18  math_interest        128594 non-null  object 
 19  math_anxiety         127967 non-null  object 
 20  failure_attr         127963 non-null  object 
 21  father_earning_pct   169195 non-null  float64
 22  mother_earning_pct   157717 non-null  float64
 23  math_behaviour       128231 non-null  float64
 24  teacher_behaviour    128395 non-null  float64
 25  teacher_support      129264 non-null  float64
 26  math_score           200000 non-null  float64
 27  read_score           200000 non-null  float64
 28  science_score        200000 non-null  float64
dtypes: float64(10), int64(2), object(17)
memory usage: 44.3+ MB


# Fix student id type i.e., change the type to an object
pisa_df['stud_id'] = pisa_df['stud_id'].astype('str')


# Check the student id type after the fix

pisa_df['stud_id'].dtype

dtype('O')


pisa_df['skip_day'].unique()

array(['None  ', nan, 'One or two times  ', 'Three or four times  ',
       'Five or more times  '], dtype=object)


# Label missing answers as 'None', then trim the padding the raw labels carry
# (e.g. 'None  ').
# NOTE(review): this counts students who gave no answer as having skipped zero
# days — confirm that is the intended treatment of missing responses.
pisa_df['skip_day'] = pisa_df['skip_day'].fillna('None').str.strip()


# After fixed
pisa_df['skip_day'].unique()

array(['None', 'One or two times', 'Three or four times',
       'Five or more times'], dtype=object)


pisa_df['skip_class'].unique()

array([ 1.,  2.,  3., nan,  4.])


pisa_df['country'].unique()

array(['Albania', 'United Arab Emirates ', 'Argentina', 'Australia',
       'Austria', 'Belgium', 'Bulgaria ', 'Brazil ', 'Canada ',
       'Switzerland', 'Chile', 'Colombia ', 'Costa Rica ',
       'Czech Republic ', 'Germany', 'Denmark', 'Spain', 'Estonia',
       'Finland', 'France ', 'United Kingdom (excl.Scotland) ',
       'United Kingdom (Scotland)'], dtype=object)


# Fixing the leading and trailing space
pisa_df['country'] = pisa_df['country'].str.strip()


# After fixed
pisa_df['country'].unique()

array(['Albania', 'United Arab Emirates', 'Argentina', 'Australia',
       'Austria', 'Belgium', 'Bulgaria', 'Brazil', 'Canada',
       'Switzerland', 'Chile', 'Colombia', 'Costa Rica', 'Czech Republic',
       'Germany', 'Denmark', 'Spain', 'Estonia', 'Finland', 'France',
       'United Kingdom (excl.Scotland)', 'United Kingdom (Scotland)'],
      dtype=object)


pisa_df.duplicated().sum()

0


# Parent school level columns
p_schlvl_cols = ['mother_sch_lvl', 'father_sch_lvl']

parent_lvl = pisa_df[p_schlvl_cols]


parent_lvl.head()


def check_unique_psch():
    """Print the distinct values of every parent school-level column of ``pisa_df``.

    The column names come from the module-level ``p_schlvl_cols`` list; each
    column's values are followed by a blank line.
    """
    for name in p_schlvl_cols:
        distinct = pisa_df[name].unique()
        print(f'{name}: ', distinct, end="\n\n")

check_unique_psch()

mother_sch_lvl:  ['<ISCED level 3A> ' '<ISCED level 3B, 3C> '
 'She did not complete <ISCED level 1> ' '<ISCED level 2> '
 '<ISCED level 1> ' nan]

father_sch_lvl:  ['<ISCED level 3A> ' '<ISCED level 3B, 3C> ' '<ISCED level 2> '
 'He did not complete <ISCED level 1> ' nan '<ISCED level 1> ']


def fix_parent_sch_lvl():
    """Normalise the labels of the parent school-level columns in ``pisa_df``.

    For every column of ``parent_lvl`` the matching ``pisa_df`` column is
    stripped of surrounding whitespace, and a label that is entirely wrapped in
    angle brackets (``'<ISCED level 2>'``) loses that wrapper.  Labels that only
    contain a bracketed part (``'She did not complete <ISCED level 1>'``) and
    missing values are left as they are.
    """
    for col in parent_lvl.columns:
        # Fix leading/trailing spaces
        cleaned = pisa_df[col].str.strip()
        # One vectorised substitution replaces the former extract + row-wise apply,
        # which was slow on large frames and relied on an identity test against
        # np.nan to detect the unmatched regex group.
        pisa_df[col] = cleaned.str.replace(r'^<(.*)>$', r'\1', regex=True)


fix_parent_sch_lvl()


# After fixing parent school level
check_unique_psch()

mother_sch_lvl:  ['ISCED level 3A' 'ISCED level 3B, 3C'
 'She did not complete <ISCED level 1>' 'ISCED level 2' 'ISCED level 1'
 nan]

father_sch_lvl:  ['ISCED level 3A' 'ISCED level 3B, 3C' 'ISCED level 2'
 'He did not complete <ISCED level 1>' nan 'ISCED level 1']


def order_cat(columns: list, order: list =None):
    """Turn each listed ``pisa_df`` column into an ordered categorical.

    ``order`` lists the category labels from lowest to highest; when it is
    omitted (or empty) the four-point agreement scale is used.  The columns are
    replaced on the module-level ``pisa_df``; values that are not among the
    categories become missing.
    """
    levels = order if order else ['Strongly disagree', 'Disagree', 'Agree', 'Strongly agree']
    for name in columns:
        ordered_values = pd.Categorical(pisa_df[name], categories=levels, ordered=True)
        pisa_df[name] = ordered_values


# Categories run from lowest to highest attainment. <ISCED level 3B, 3C> (vocational /
# pre-vocational upper secondary) ranks below <ISCED level 3A> on the OECD
# parental-education scale, so it must come before 3A for the ordering to be meaningful.
sch_lvl = ['She did not complete <ISCED level 1>','He did not complete <ISCED level 1>']
for col, lvl in zip(p_schlvl_cols, sch_lvl):
    order_cat([col], order=[lvl, 'ISCED level 1', 'ISCED level 2',
                            'ISCED level 3B, 3C', 'ISCED level 3A'])


# After correcting the order: show the dtype of each parent school-level column
for col in p_schlvl_cols:
    print(f'{col}: ', pisa_df[col].dtype, end="\n\n")

mother_sch_lvl:  category

father_sch_lvl:  category


pisa_df['mother_job_status'].unique()

array(['Other (e.g. home duties, retired) ',
       'Working full-time <for pay> ', 'Working part-time <for pay>',
       'Not working, but looking for a job ', nan], dtype=object)


job_status = ['mother_job_status', 'father_job_status']

# Category labels in the order applied to both job-status columns
job_status_order = [
    'Not working, but looking for a job',
    'Other (e.g. home duties, retired)',
    'Working part-time <for pay>',
    'Working full-time <for pay>',
]

# Trim the padded raw labels first so they match the category names above
for status_col in job_status:
    pisa_df[status_col] = pisa_df[status_col].str.strip()

order_cat(job_status, order=job_status_order)


pisa_df['pri_sch'].unique()

array(['No  ', 'Yes, for more than one year',
       'Yes, for one year or less ', nan], dtype=object)


# Fixing trailing spaces
pisa_df['pri_sch'] = pisa_df['pri_sch'].str.strip()


# Unique elements after fixed
pisa_df['pri_sch'].unique()

array(['No', 'Yes, for more than one year', 'Yes, for one year or less',
       nan], dtype=object)


order_cat(['pri_sch'], 
            order=['No', 'Yes, for one year or less', 'Yes, for more than one year'])


# After student experience in primary school has been ordered
pisa_df['pri_sch'].dtype

CategoricalDtype(categories=['No', 'Yes, for one year or less',
                  'Yes, for more than one year'],
, ordered=True)


possession = pisa_df.columns[pisa_df.columns.str.startswith('possess')]
pisa_df[possession].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   possess_room         194627 non-null  object
 1   possess_study_place  193172 non-null  object
 2   possess_computer     194277 non-null  object
 3   possess_internet     194110 non-null  object
 4   possess_textbook     192970 non-null  object
dtypes: object(5)
memory usage: 7.6+ MB


pisa_df[possession].head(2)


order_cat(possession, order=['No', 'Yes'])


pisa_df[possession].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   possess_room         194627 non-null  category
 1   possess_study_place  193172 non-null  category
 2   possess_computer     194277 non-null  category
 3   possess_internet     194110 non-null  category
 4   possess_textbook     192970 non-null  category
dtypes: category(5)
memory usage: 977.3 KB


# making gender to be of category type
pisa_df['gender'] = pisa_df['gender'].astype('category')


math_related = pisa_df.columns[pisa_df.columns.str.startswith('math')]

pisa_df[math_related].head()


for col in math_related[:2]:
    print(f'{col}: ', pisa_df[col].unique())

math_interest:  ['Agree' 'Strongly agree' nan 'Disagree' 'Strongly disagree']
math_anxiety:  ['Agree' nan 'Strongly agree' 'Disagree' 'Strongly disagree']


order_cat(math_related[:2])


pisa_df[math_related[1]].dtype

CategoricalDtype(categories=['Strongly disagree', 'Disagree', 'Agree', 'Strongly agree'], ordered=True)


order_cat(['failure_attr'], order=['Not at all likely', 'Slightly likely', 'Likely', 'Very   Likely'])


# after order
pisa_df['failure_attr'].dtype

CategoricalDtype(categories=['Not at all likely', 'Slightly likely', 'Likely',
                  'Very   Likely'],
, ordered=True)


pisa_df.describe()


pisa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   country              200000 non-null  object  
 1   stud_id              200000 non-null  object  
 2   intl_grade           200000 non-null  int64   
 3   gender               200000 non-null  category
 4   pri_sch              194537 non-null  category
 5   age_at_pri_sch       184727 non-null  float64 
 6   late_for_school      196367 non-null  object  
 7   skip_day             200000 non-null  object  
 8   skip_class           196481 non-null  float64 
 9   mother_sch_lvl       186541 non-null  category
 10  mother_job_status    191405 non-null  category
 11  father_sch_lvl       179712 non-null  category
 12  father_job_status    184470 non-null  category
 13  possess_room         194627 non-null  category
 14  possess_study_place  193172 non-null  category
 15  possess_computer     194277 non-null  category
 16  possess_internet     194110 non-null  category
 17  possess_textbook     192970 non-null  category
 18  math_interest        128594 non-null  category
 19  math_anxiety         127967 non-null  category
 20  failure_attr         127963 non-null  category
 21  father_earning_pct   169195 non-null  float64 
 22  mother_earning_pct   157717 non-null  float64 
 23  math_behaviour       128231 non-null  float64 
 24  teacher_behaviour    128395 non-null  float64 
 25  teacher_support      129264 non-null  float64 
 26  math_score           200000 non-null  float64 
 27  read_score           200000 non-null  float64 
 28  science_score        200000 non-null  float64 
dtypes: category(14), float64(10), int64(1), object(4)
memory usage: 25.6+ MB


pisa_df.to_csv('process_data.csv')


pisa_df.head()


from typing import Iterable

def count_dplot(cols: Iterable, plottype=sb.countplot, nrows: int=1, ncols=2, usex: bool=True,
                 figsize: tuple=(16, 7), annotr=0, tick_angle=90):
    """
        Draw count charts for columns of ``pisa_df`` and annotate every bar.

        With ``usex=True`` (the default) ``cols`` is an iterable of column names. Each column gets
        its own subplot in an ``nrows`` x ``ncols`` grid, bars are sorted by frequency, and each bar
        is labelled with its count in thousands (e.g. ``12.3k``).
        With ``usex=False`` ``cols`` is a single column name drawn as one horizontal chart whose
        bars are labelled with their raw counts.

        plottype   : plotting callable taking data/x/y/color/order/ax keywords (default ``sb.countplot``).
        nrows/ncols: shape of the subplot grid (vertical layout only).
        figsize    : size of the whole figure.
        annotr     : rotation, in degrees, of the bar annotations.
        tick_angle : rotation, in degrees, of the x tick labels (vertical layout only).
    """
    base_color = sb.color_palette()[0]

    if not usex:
        fig, ax = plt.subplots(figsize=figsize)
        plottype(data=pisa_df, y=cols, color=base_color, ax=ax)
        ax.bar_label(ax.containers[0], label_type='edge', rotation=annotr) # Annotate all bars
        return

    # Start from a bare figure: creating a full-size axes first would leave an empty axes
    # underneath the grid on Matplotlib releases that no longer remove overlapping axes.
    fig = plt.figure(figsize=figsize)
    for i, x in enumerate(cols, 1):
        countv = pisa_df[x].value_counts()
        ax = fig.add_subplot(nrows, ncols, i)
        plottype(data=pisa_df, x=x, color=base_color, order=countv.index, ax=ax)
        # Capitalize and bold the xlabel
        ax.set_xlabel(ax.get_xlabel().replace("_", " ").title(), fontweight='bold')
        ax.tick_params(axis='x', labelrotation=tick_angle)
        # Bars are drawn in countv order at x = 0, 1, 2, ..., so the position alone identifies
        # the count; looking it up through the tick-label text fails for non-string categories.
        for loc, count in enumerate(countv):
            ax.text(loc, count + 6e2, f'{round(count/1000, 1)}k',
                    ha='center', color='black', rotation=annotr)
        ax.set_yticks([])
        for side in ('right', 'left', 'top'):
            ax.spines[side].set_visible(False) # Turn off the chosen spines
    fig.tight_layout() # To avoid subplot overlapping


count_dplot(['country', 'gender'], annotr=90)


count_dplot('pri_sch', usex=False, figsize=(9, 5))
plt.ylabel('Attend Primary school');


pisa_df['age_at_pri_sch'].describe()

count    184727.000000
mean          6.053138
std           1.186729
min           4.000000
25%           5.000000
50%           6.000000
75%           7.000000
max          16.000000
Name: age_at_pri_sch, dtype: float64


# Ages are whole years from 4 to 16. The last histogram bin is closed on both sides, so
# the edges must run one step past the maximum or ages 15 and 16 end up in the same bar.
bin_size = np.arange(4, 16 + 2, 1)
g = sb.histplot(data=pisa_df, x='age_at_pri_sch', bins=bin_size)
g.set_xlabel('Age at Primary school')
g.set_title('Distribution of Age at Primary School');


count_dplot(possession, ncols=3, nrows=2, figsize=(10, 7), tick_angle=0)


count_dplot(p_schlvl_cols, annotr=0)


count_dplot(['father_job_status', 'mother_job_status'])


pisa_df[['father_earning_pct', 'mother_earning_pct']].describe()


# The values reach 88.96 and np.arange excludes its stop, so the stop is pushed one step
# further; with a last edge of 87 everything above it was left out of the histogram.
bins = np.arange(11, 89 + 2, 2)
plt.figure(figsize= (14, 5))
for k, col in enumerate(['father_earning_pct', 'mother_earning_pct'], 1):
    plt.subplot(1, 2, k)
    plt.hist(pisa_df[col], bins=bins, color='b', alpha=0.5)
    plt.xlabel(col.replace("_", " ").title())
    plt.ylabel('Frequency')  # label every panel, not only the last one drawn
plt.suptitle('Distribution of Parent Earnings', fontweight='bold');


count_dplot(math_related[:2], figsize=(10, 5))


# teacher_support goes down to about -2.9 and teacher_behaviour to about -2.4, so the
# edges start at -3 (starting at -2 dropped every value below it). The extra step on the
# stop keeps the upper end of the range covered.
bins = np.arange(-3, 3 + 0.3, 0.3)
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1)
plt.hist(pisa_df['teacher_behaviour'], bins=bins)
plt.title("Teacher's behaviour")
plt.xlabel('Rate')

plt.subplot(1,2,2)
plt.hist(pisa_df['teacher_support'], bins=bins)
plt.title("Teacher's support")
plt.xlabel('Rate');


def score_distribution():
    """Plot side-by-side histograms of the math, reading and science scores in ``pisa_df``."""
    score_cols = ['math_score', 'read_score', 'science_score']
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 6))
    for panel, score in zip(axes, score_cols):
        hist_ax = sb.histplot(data=pisa_df, x=score, ax=panel)
        # 'math_score' -> 'Math Score'
        pretty_label = hist_ax.get_xlabel().replace("_", " ").title()
        hist_ax.set_xlabel(pretty_label)
    plt.suptitle('Student score distribution in different subjects', fontweight='bold')
    plt.tight_layout()

score_distribution()


pisa_df.iloc[:, -3:].describe().round(2)


pisa_df['math_score'].describe()

count    200000.000000
mean        471.075796
std         101.449512
min          19.792800
25%         397.655400
50%         470.252400
75%         543.316700
max         896.798600
Name: math_score, dtype: float64


# Keep students whose math score lies between 397 and 600, then rank the countries by
# the 75th percentile of math score within that subset (highest first).
mathscore_above_25_percent = pisa_df.query("math_score >= 397 & math_score <= 600")
order = (
    mathscore_above_25_percent
    .groupby('country')['math_score']
    .quantile(.75)
    .sort_values(ascending=False)
    .index
)
fig, ax = plt.subplots(figsize=(9, 10))
sb.boxplot(
    data=mathscore_above_25_percent,
    y='country',
    x='math_score',
    order=order,
    color=sb.color_palette()[0],
);


def possess_score(subject:str= 'math_score', data: pd.DataFrame=mathscore_above_25_percent):
    """Bar-plot `subject` against every possession column, one panel per column.

    subject : name of the column of `data` plotted on the y axis.
    data    : frame to plot; defaults to the `mathscore_above_25_percent` frame as it
              existed when this function was defined.

    Every bar is annotated with its height to two decimals.
    """
    subj_cap = subject.replace("_", " ").title()
    n_panels = len(possession)
    # One panel per entry of `possession`. The grid used to be fixed at four panels, which
    # silently left out the last possession column; the per-panel width is kept the same.
    fig, axis = plt.subplots(nrows=1, ncols=n_panels,
                             figsize=(13 / 4 * n_panels, 4), squeeze=False)
    for item, ax in zip(possession, axis[0]):
        g = sb.barplot(data=data,
                       x=item, y=subject,
                       ax=ax, color=sb.color_palette()[0])

        g.set(xlabel=g.get_xlabel().replace("_", " ").title(),
              ylabel=subj_cap)
        ax.bar_label(ax.containers[0], fmt="%.2f", label_type='edge', padding=2)
    plt.suptitle(f"{subj_cap} vs Student Possession")
    plt.tight_layout()


possess_score()


possess_score('read_score')


possess_score('science_score')


possess_score(data=pisa_df.query('country=="Belgium"'))


possess_score('intl_grade')


def primary_edu_plot(x):
    """Draw a violin plot of column `x` of ``pisa_df`` for each `pri_sch` category."""
    axis_label = x.replace('_', " ").title()
    sb.violinplot(data=pisa_df, y='pri_sch', x=x, inner='quartile', palette='Greens')
    plt.xlabel(axis_label)
    plt.ylabel('Attend Primary Education')


primary_edu_plot('math_score')


primary_edu_plot('read_score')


primary_edu_plot('science_score')


# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later dropped
# the old names, so use whichever spelling this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
def age_influence_plot(score, p='Blues_r'):
    """Boxen plot of `score` for each `age_at_pri_sch` value in ``pisa_df``.

    score : name of the score column shown on the y axis.
    p     : seaborn palette name used for the boxes.
    """
    score_label = score.replace('_', " ").title()
    fig, ax = plt.subplots(figsize=(15, 6))
    sb.boxenplot(data=pisa_df, x='age_at_pri_sch', y=score, palette=p, ax=ax)
    ax.set_xlabel('Age at Primary School')
    ax.set_ylabel(score_label)
    ax.set_title("Early education influence on Education")


age_influence_plot('math_score')


age_influence_plot('read_score', p='Greens_r')


age_influence_plot('science_score', p='flare_r')


def parent_degree_inf(col):
    """
        Parent degree influence on student performance.

        Draws two stacked panels for the ``pisa_df`` column `col` (a parent school-level
        column): a box plot of math score on top and a point plot of reading score below.
    """
    fig, axis = plt.subplots(nrows=2, figsize=(9, 12))
    panels = (
        (sb.boxplot, 'math_score', axis[0]),
        (sb.pointplot, 'read_score', axis[1]),
    )
    for plot, score, ax in panels:
        g = plot(data=pisa_df, y=col, x=score, palette='viridis_r', ax=ax)
        # 'mother_sch_lvl' -> 'Mother Sch Lvl', 'math_score' -> 'Math Score'
        g.set(xlabel=g.get_xlabel().replace("_", " ").title(),
              ylabel=g.get_ylabel().replace("_", " ").title())
    # Title typo ("edcuation") corrected
    plt.suptitle('Effect of parent degree on student education performance')


parent_degree_inf('mother_sch_lvl')


parent_degree_inf('father_sch_lvl')


def parent_emp_inf():
    """
        Parent employment influence on student performance.

        Top panel: violin plot of math score per father job status.
        Bottom panel: boxen plot of reading score per mother job status.
    """
    def prettify(ax):
        # Replace underscores and title-case both axis labels
        ax.set(xlabel=ax.get_xlabel().replace("_", " ").title(),
               ylabel=ax.get_ylabel().replace("_", " ").title())

    fig, (top, bottom) = plt.subplots(nrows=2, figsize=(9, 12))

    father_ax = sb.violinplot(data=pisa_df, y='father_job_status', x='math_score',
                              inner='quartile', palette='Blues', ax=top)
    prettify(father_ax)

    mother_ax = sb.boxenplot(data=pisa_df, y='mother_job_status', x='read_score',
                             palette='Greens', ax=bottom)
    prettify(mother_ax)


parent_emp_inf()


# A fixed seed keeps the 5000-row sample — and therefore the figure — identical on every run.
sb.regplot(data=pisa_df.sample(5000, random_state=42), x='father_earning_pct', y='math_score', scatter_kws={'alpha': 0.07});


# Same seeded sampling for the mother's index against the reading score.
sb.regplot(data=pisa_df.sample(5000, random_state=42), x='mother_earning_pct', y='read_score', scatter_kws={'alpha': 0.1, 'color': 'r'});


# Scatter of math score against teacher_behaviour for every row of pisa_df
# (fit_reg=False: points only, no regression line)
sb.regplot(data=pisa_df, x='teacher_behaviour', y='math_score', fit_reg=False, scatter_kws={'alpha': 0.3});


# Same scatter for the reading score
sb.regplot(data=pisa_df, x='teacher_behaviour', y='read_score', fit_reg=False, scatter_kws={'alpha': 0.3});


# Only the two plotted columns need to be complete; a bare dropna() would also discard
# every row that has a gap in any unrelated column.
math_pairs = pisa_df.dropna(subset=['math_behaviour', 'math_score'])
plt.hist2d(data=math_pairs, x='math_behaviour', y='math_score', cmap='Blues', cmin=0.5)
plt.xlim([-2, 3])
plt.xlabel('Student Math Behaviour')
plt.ylabel('Math Score')
plt.colorbar();


def gender_effect_plot(exam_scores):
    """Draw one violin plot per score column in `exam_scores`, split by gender.

    The panels are laid out in a single row of three.
    """
    plt.figure(figsize=(15, 7))
    for position, score in enumerate(exam_scores, start=1):
        panel = plt.subplot(1, 3, position)
        sb.violinplot(data=pisa_df, x='gender', y=score, inner='quartile',
                      color='lightgreen', ax=panel)
        panel.set(ylabel=score.replace("_", " ").title(), xlabel='Gender')
    plt.suptitle('Students performance in Examination base on their Gender')
    plt.tight_layout()


# Examination score
exam_scores = ['math_score', 'read_score', 'science_score']

gender_effect_plot(exam_scores)


# Count of each math-anxiety answer, split by gender
fig, ax = plt.subplots(figsize=(7, 6))
g = sb.countplot(data=pisa_df, x='math_anxiety', hue='gender')
g.set(xlabel='Math anxiety')
# Annotate the bars of the first two hue groups with their counts
for bars in ax.containers[:2]:
    ax.bar_label(bars, label_type='edge', padding=2)
plt.legend(title='Gender');


# Math score per country and gender; countries follow the previously computed `order`
fig, ax = plt.subplots(figsize=(15, 6))
g = sb.barplot(data=pisa_df, y='math_score', x='country', errwidth=0, hue='gender', order=order)
# Write each bar's value vertically inside the bar for the first two hue groups
for bars in ax.containers[:2]:
    ax.bar_label(bars, rotation=90, fmt='%.1f', label_type='center', color='white')
g.set(xlabel="Country", ylabel="Math score")
plt.xticks(rotation=90)
plt.legend(title='Gender');


# Math score per pri_sch category, split by possession of a study place
g = sb.violinplot(
    data=pisa_df,
    x='pri_sch',
    y='math_score',
    hue='possess_study_place',
    inner='quartile',
)
g.set(
    xlabel="Primary Education",
    ylabel="Math Score",
    title="Primary Education and Possession of Study place\non Student Performance",
)
plt.legend(loc=2, title="Possess Study Place");


# include_lowest=True keeps students who started at exactly 4 (the minimum age) in the
# first bin; with pd.cut's default left-open intervals they would get no label at all.
# assign() builds a new frame instead of writing a column into a query() result, which
# is what triggers pandas' SettingWithCopyWarning.
mathscore_above_25_percent = mathscore_above_25_percent.assign(
    age_label=pd.cut(mathscore_above_25_percent['age_at_pri_sch'],
                     bins=[4, 6, 8, 10, 12, 14, 16],
                     labels=["4-6yrs", "6-8yrs", '8-10yrs', "10-12yrs", '12-14yrs', '14-16yrs'],
                     include_lowest=True)
)

fig, ax = plt.subplots(figsize=(15, 7))
g = sb.boxplot(data=mathscore_above_25_percent, x='age_label', y='read_score', hue='mother_sch_lvl',
                     palette='viridis_r', ax=ax)
g.set(xlabel='Student age in Primary Education', 
            ylabel="Read Score", title="Student age in Primary Education & Mother's Education effect\non\n Student Performance")
plt.legend(loc=(1, 0.5), title="Mother's Education");


# A categorical plot drawn through FacetGrid.map should get an explicit order, otherwise
# each facet derives the category layout from its own subset of the data (seaborn warns
# about this). The order is taken from the ordered categorical set up for math_interest.
interest_levels = list(pisa_df['math_interest'].cat.categories)
g = sb.FacetGrid(data=pisa_df, col='gender', aspect=1, height=5)
g.map(sb.boxenplot, 'math_interest', 'math_score', order=interest_levels, palette='Blues');


# Correlate the numeric columns only: pisa_df also holds object and category columns,
# and DataFrame.corr() raises on those in pandas 2 and later instead of skipping them.
numeric_corr = pisa_df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 7))
sb.heatmap(numeric_corr, cmap='viridis_r', annot=True, fmt='.1f', ax=ax);


# One facet per possess_study_place category, each showing a line plot of
# math_score against father_earning_pct
g = sb.FacetGrid(data=pisa_df, col='possess_study_place', aspect=1.2, height=7)
g.map(sb.lineplot, 'father_earning_pct', 'math_score');

	math_interest	math_anxiety	math_behaviour	math_score
0	Agree	Agree	0.6426	406.8469
1	Agree	NaN	1.4702	486.1427
2	Strongly agree	NaN	0.9618	533.2684
3	NaN	NaN	NaN	412.2215
4	Strongly agree	Strongly agree	1.8169	381.9209

	intl_grade	age_at_pri_sch	skip_class	father_earning_pct	mother_earning_pct	math_behaviour	teacher_behaviour	teacher_support	math_score	read_score	science_score
count	200000.000000	184727.000000	196481.000000	169195.000000	157717.000000	128231.000000	128395.000000	129264.000000	200000.000000	200000.000000	200000.000000
mean	9.724060	6.053138	1.254223	43.876301	45.086883	0.137802	0.140403	0.180976	471.075796	474.879066	480.629329
std	2.229471	1.186729	0.573415	21.884619	21.995521	1.030937	1.030047	1.003438	101.449512	102.002925	101.774531
min	7.000000	4.000000	1.000000	11.010000	11.010000	-2.140200	-2.391900	-2.920000	19.792800	0.083400	6.844500
25%	9.000000	5.000000	1.000000	25.710000	25.040000	-0.456700	-0.594500	-0.470000	397.655400	407.008400	409.399500
50%	10.000000	6.000000	1.000000	36.350000	43.330000	0.217100	0.250900	0.110000	470.252400	479.131800	481.853800
75%	10.000000	7.000000	1.000000	65.010000	65.420000	0.811000	0.764400	0.970000	543.316700	546.908300	553.002600
max	96.000000	16.000000	4.000000	88.960000	88.960000	4.424900	2.629500	1.680000	896.798600	875.705800	845.897100

	father_earning_pct	mother_earning_pct
count	169195.000000	157717.000000
mean	43.876301	45.086883
std	21.884619	21.995521
min	11.010000	11.010000
25%	25.710000	25.040000
50%	36.350000	43.330000
75%	65.010000	65.420000
max	88.960000	88.960000

	math_score	read_score	science_score
count	200000.00	200000.00	200000.00
mean	471.08	474.88	480.63
std	101.45	102.00	101.77
min	19.79	0.08	6.84
25%	397.66	407.01	409.40
50%	470.25	479.13	481.85
75%	543.32	546.91	553.00
max	896.80	875.71	845.90

	NC	STIDSTD	ST01Q01	ST04Q01	ST05Q01	ST06Q01	ST08Q01	ST09Q01	ST115Q01	ST13Q01	...	ST42Q01	ST44Q03	BFMJ2	BMMJ1	MATBEH	TCHBEHFA	TEACHSUP	PV1MATH	PV1READ	PV1SCIE
0	Albania	1	10	Female	No	6.0	None	None	1.0	<ISCED level 3A>	...	Agree	Slightly likely	76.49	79.74	0.6426	1.3625	1.68	406.8469	249.5762	341.7009
1	Albania	2	10	Female	Yes, for more than one year	7.0	One or two times	None	1.0	<ISCED level 3A>	...	NaN	Slightly likely	15.35	23.47	1.4702	NaN	NaN	486.1427	406.2936	548.9929
2	Albania	3	9	Female	Yes, for more than one year	6.0	None	None	1.0	<ISCED level 3B, 3C>	...	NaN	Likely	22.57	NaN	0.9618	NaN	NaN	533.2684	401.2100	499.6643
3	Albania	4	9	Female	Yes, for more than one year	6.0	None	None	1.0	<ISCED level 3B, 3C>	...	NaN	NaN	14.21	NaN	NaN	0.7644	1.68	412.2215	547.3630	438.6796
4	Albania	5	9	Female	Yes, for more than one year	6.0	One or two times	None	2.0	She did not complete <ISCED level 1>	...	Strongly agree	Likely	80.92	NaN	1.8169	0.7644	0.11	381.9209	311.7707	361.5628

	possess_room	possess_study_place	possess_computer	possess_internet	possess_textbook
0	No	Yes	No	No	Yes
1	Yes	Yes	Yes	Yes	Yes

Student Performance Evaluation Analysis¶

by Olusola Timothy Ogundepo¶

Introduction¶

Preliminary Wrangling¶

What is the structure of your dataset?¶

What is/are the main feature(s) of interest in your dataset?¶

What features in the dataset do you think will help support your investigation into your feature(s) of interest?¶

Rename all columns to their appropriate names¶

Identify and Fix issues¶

Data type of all features¶

Checking the unique items of:¶

Country¶

Checking for duplicates in the dataset¶

Parent school level¶

Parent Job Status¶

Primary education¶

Possessions¶

Gender¶

Math related¶

Student failure attribute¶

Summary analysis of the dataset¶

Saving the processed data¶

Univariate Exploration¶

Most common country and gender¶

Students Primary Education¶

Possession¶

Parent Education Level & Earnings¶

Parent Job Status¶

Distribution of Parent Earning (Pct)¶

Math attributes¶

Distribution of Students performance in different subjects¶

Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?¶

Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?¶

Bivariate Exploration¶

Countries with highest student score¶

Possession effect on student academic performance¶

Investigation of students in Belgium¶

Primary Education on Academic performance¶

Early Education Influence¶

Parent Education Influence on the Performance of the Students¶

Parent Employment Status influence on Student Performance¶

Parent Earning effect on Student Education Performance¶

Relationship between teacher's behaviour and student examination score¶

Influence of students behaviour on the examination score¶

Gender role on Academic Performance¶

Subject Anxiety on Gender¶

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?¶

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?¶

Multivariate Exploration¶

Gender, Score and Country¶

Primary Education and Possession of study place effect on Student Performance¶

Student age in Primary Education effect on their reading performance¶

Student Interest effect on their Performance in Examination¶

Relationships between all numerical features¶

Parent Earning Effect on Students score¶

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?¶

Were there any interesting or surprising interactions between features?¶

Conclusions¶