# import the usual libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
# install library to convert between full country names and ISO3 abbreviations
!pip install country_converter --upgrade
# install library for the heat map
!pip install geopandas
import geopandas as gpd
#union density dataset
union = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/ILO_union_density/main/data-SmplN.csv')
#world happiness report 2019 (chosen because the union dataset has the most entries for 2019)
happiness = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/world-happiness-report/main/2019.csv')
# only use 2019 data since that's the year with the most observations
union = union.loc[union['time']==2019.0]
# remove unnecessary columns from the union dataframe; we only need the observed value and the country name
union = union.drop(['time', 'area_time', 'Column 1','Column 1.1','Column 1.2'], axis = 1)
# do the same for happiness
happiness = happiness.drop(['Overall rank', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], axis = 1)
We'll center the data. This is as simple as subtracting the mean from each observation:
$x-\bar{x}$
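For instance, on a toy series (hypothetical values):
# toy example: centering a small series
s = pd.Series([2.0, 4.0, 6.0])
print(s - s.mean())  # -> -2.0, 0.0, 2.0; the centered series has mean 0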
# compute the mean of each variable
union_mean = union['obs_value'].mean()
happiness_mean = happiness['Score'].mean()
# center both variables by subtracting their means
union['obs_value'] = union['obs_value'] - union_mean
happiness['Score'] = happiness['Score'] - happiness_mean
Next we'll scale the data by standardizing it: we divide each centered value by its standard deviation, yielding z-scores.
$x' = \frac{x-\bar{x}}{\sigma_x}$
#Scaling data for happiness (already centered above, so we just divide by the standard deviation)
happiness['Score'] = happiness['Score']/happiness['Score'].std()
#Scaling data for unions
union['obs_value'] = union['obs_value']/union['obs_value'].std()
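A quick sanity check: after standardizing, both variables should have mean ≈ 0 and standard deviation ≈ 1.
# sanity check: both variables should now have mean ~0 and std ~1
print(happiness['Score'].mean(), happiness['Score'].std())
print(union['obs_value'].mean(), union['obs_value'].std())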
happiness
| | Country or region | Score |
|---|---|---|
0 | Finland | 2.121877 |
1 | Denmark | 1.970052 |
2 | Norway | 1.928727 |
3 | Iceland | 1.874824 |
4 | Netherlands | 1.869434 |
... | ... | ... |
151 | Rwanda | -1.862420 |
152 | Tanzania | -1.954952 |
153 | Afghanistan | -1.980107 |
154 | Central African Republic | -2.087912 |
155 | South Sudan | -2.294538 |
156 rows × 2 columns
import country_converter as coco
# convert full country names to ISO3 codes so they match the union dataset's country labels
happiness['Country or region'] = coco.convert(names=happiness['Country or region'], to='ISO3')
happiness
| | Country or region | Score |
|---|---|---|
0 | FIN | 2.121877 |
1 | DNK | 1.970052 |
2 | NOR | 1.928727 |
3 | ISL | 1.874824 |
4 | NLD | 1.869434 |
... | ... | ... |
151 | RWA | -1.862420 |
152 | TZA | -1.954952 |
153 | AFG | -1.980107 |
154 | CAF | -2.087912 |
155 | SSD | -2.294538 |
156 rows × 2 columns
happiness.rename(columns = {'Country or region':'ref_area'}, inplace = True)
# combine datasets with a left merge, keeping every happiness row
df = happiness.merge(union, how='left')
df
| | ref_area | Score | obs_value |
|---|---|---|---|
0 | FIN | 2.121877 | 2.089948 |
1 | DNK | 1.970052 | 2.544855 |
2 | NOR | 1.928727 | 1.623946 |
3 | ISL | 1.874824 | 3.898480 |
4 | NLD | 1.869434 | -0.317729 |
... | ... | ... | ... |
151 | RWA | -1.862420 | -0.844755 |
152 | TZA | -1.954952 | NaN |
153 | AFG | -1.980107 | -0.240062 |
154 | CAF | -2.087912 | NaN |
155 | SSD | -2.294538 | NaN |
156 rows × 3 columns
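Since this was a left merge, countries without a 2019 union-density observation come through with NaN in obs_value. A quick count of how many:
# count countries with no matching union-density observation
print(df['obs_value'].isna().sum())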
df.rename(columns={'Score':'Happiness'}, inplace=True)
df.rename(columns={'obs_value':'Union Density'}, inplace=True)
df.rename(columns={'ref_area':'Country'}, inplace=True)
# drop countries with no union-density observation
df = df.dropna()
df
| | Country | Happiness | Union Density |
|---|---|---|---|
0 | FIN | 2.121877 | 2.089948 |
1 | DNK | 1.970052 | 2.544855 |
2 | NOR | 1.928727 | 1.623946 |
3 | ISL | 1.874824 | 3.898480 |
4 | NLD | 1.869434 | -0.317729 |
6 | SWE | 1.739169 | 2.444997 |
9 | AUT | 1.652027 | 0.281417 |
11 | CRI | 1.581055 | -0.034799 |
13 | LUX | 1.511880 | 0.392370 |
14 | GBR | 1.479539 | 0.126083 |
16 | DEU | 1.417551 | -0.267800 |
17 | BEL | 1.361851 | 1.551827 |
26 | GTM | 0.924342 | -1.000089 |
29 | ESP | 0.850676 | -0.484158 |
30 | PAN | 0.821029 | 0.286964 |
31 | BRA | 0.802163 | -0.450872 |
33 | SGP | 0.768025 | 0.059511 |
35 | ITA | 0.732988 | 0.630918 |
36 | BHR | 0.711427 | -1.027827 |
37 | SVK | 0.710529 | -0.556277 |
41 | LTU | 0.666508 | -0.761540 |
42 | COL | 0.644947 | -0.911326 |
51 | THA | 0.539837 | -0.988993 |
52 | LVA | 0.478748 | -0.584015 |
54 | EST | 0.436524 | -0.839207 |
57 | JPN | 0.430236 | -0.240062 |
60 | BOL | 0.334109 | -0.506348 |
61 | HUN | 0.315244 | -0.689421 |
62 | PRY | 0.301768 | -0.789278 |
70 | MDA | 0.109515 | -0.140204 |
76 | DOM | 0.016084 | -0.755992 |
78 | TUR | -0.030631 | -0.622849 |
82 | MNG | -0.109688 | 0.536608 |
83 | MKD | -0.119570 | -0.245609 |
88 | MAR | -0.178863 | -0.567373 |
91 | IDN | -0.193237 | -0.450872 |
97 | GHA | -0.369319 | -0.240062 |
101 | BEN | -0.470835 | 0.592085 |
105 | ZAF | -0.615474 | 0.442298 |
109 | PSE | -0.638832 | 0.009582 |
110 | SEN | -0.652307 | -0.761540 |
111 | SOM | -0.663986 | -0.517444 |
115 | ARM | -0.761909 | -0.034799 |
118 | GEO | -0.797844 | -0.179038 |
120 | KEN | -0.806828 | -0.633944 |
123 | TUN | -0.849950 | 0.941586 |
129 | LKA | -0.935296 | -0.611754 |
132 | UKR | -0.965840 | 0.858372 |
133 | ETH | -1.007166 | -0.650587 |
137 | ZMB | -1.167975 | 0.153821 |
138 | TGO | -1.187739 | 0.148273 |
143 | LSO | -1.441980 | -0.905779 |
151 | RWA | -1.862420 | -0.844755 |
153 | AFG | -1.980107 | -0.240062 |
#scatter plot
plt.style.use('fivethirtyeight')
plt.scatter(df['Happiness'], df['Union Density'])
plt.title("Correlation between happiness and union density")
plt.xlabel("Happiness")
plt.ylabel("Union density")
plt.show()
# histograms to eyeball each variable's distribution
plt.hist(df['Happiness'])
plt.title("Distribution of happiness scores")
plt.show()
plt.hist(df['Union Density'])
plt.title("Distribution of union density")
plt.show()
import seaborn as sns
# Extract relevant columns from the dataframe
corr_df = df[['Union Density', 'Happiness']]
# Compute the correlation matrix
corr_matrix = corr_df.corr()
# Plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
# Add title and axis labels
plt.title('Correlation Heatmap of Union Density and Happiness')
plt.xlabel('Variable')
plt.ylabel('Variable')
# Show the plot
plt.show()
A linear regression seems most appropriate for testing whether one variable predicts another, since that is its usual use.
$Y_i=f(X_i, \beta)+e_i$
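With a single predictor, this is the familiar straight-line form:
$Y_i=\beta_0+\beta_1 X_i+e_i$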
# name the dependent (y) and independent (X) variables for the regression
y_var_name = 'Union Density'
X_var_names = ['Happiness']
#linear regression
import seaborn as sns
plt.style.use('fivethirtyeight')
plt.title("Correlation between happiness and union density")
plt.xlabel("Happiness")
plt.ylabel("Union density")
sns.regplot(data=df, x='Happiness', y='Union Density')
from sklearn.cluster import KMeans
# Extract relevant columns from the dataframe
X = df[['Union Density', 'Happiness']]
# Define number of clusters
k = 4
# Fit KMeans to the data (n_init set explicitly to avoid a FutureWarning)
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
# Add a new column with the cluster labels (copy first to avoid a SettingWithCopyWarning)
df = df.copy()
df['Cluster'] = kmeans.labels_
# Plot the clusters
plt.scatter(df['Union Density'], df['Happiness'], c=df['Cluster'])
plt.xlabel('Union Density')
plt.ylabel('Happiness')
plt.title('K-means clustering of union density and happiness')
plt.show()
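The choice of k = 4 is arbitrary; one common way to sanity-check it is an elbow plot over the same X (a quick sketch):
# elbow method: plot within-cluster sum of squares (inertia) for a range of k
inertias = []
for k_try in range(1, 9):
    km = KMeans(n_clusters=k_try, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(1, 9), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.title('Elbow plot for choosing k')
plt.show()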
OLS, or ordinary least squares, estimates the coefficients of our linear regression by minimizing the sum of squared residuals between the fitted line and the data.
$\hat{{\beta}}=\left(\mathbf{X}^{\top} \mathbf{X}\right)^{-1} \mathbf{X}^{\top} \mathbf{y}$
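As a sanity check, that normal equation can be evaluated directly with numpy; the result should match the Happiness coefficient statsmodels reports below (a minimal sketch, assuming the standardized df built earlier):
# direct evaluation of the normal equation (no intercept column)
X_mat = df[['Happiness']].to_numpy()
y_vec = df['Union Density'].to_numpy()
beta_hat = np.linalg.inv(X_mat.T @ X_mat) @ X_mat.T @ y_vec
print(beta_hat)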
#OLS for union density vs happiness (no intercept term, since both variables are standardized)
X = df['Happiness']
y = df['Union Density']
model = sm.OLS(y, X, missing='drop')
results = model.fit()
results.summary()
| Dep. Variable: | Union Density | R-squared (uncentered): | 0.190 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared (uncentered): | 0.175 |
| Method: | Least Squares | F-statistic: | 12.42 |
| Date: | Thu, 06 Apr 2023 | Prob (F-statistic): | 0.000886 |
| Time: | 00:21:10 | Log-Likelihood: | -70.649 |
| No. Observations: | 54 | AIC: | 143.3 |
| Df Residuals: | 53 | BIC: | 145.3 |
| Df Model: | 1 | | |
| Covariance Type: | nonrobust | | |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Happiness | 0.4025 | 0.114 | 3.524 | 0.001 | 0.173 | 0.632 |
| Omnibus: | 15.667 | Durbin-Watson: | 1.581 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 18.327 |
| Skew: | 1.166 | Prob(JB): | 0.000105 |
| Kurtosis: | 4.646 | Cond. No. | 1.00 |
# Carve out the y (dependent) variable
pooled_y = df[y_var_name]
# Carve out the X (independent) variables
pooled_X = df[X_var_names]
# Add the placeholder for the regression intercept. When the model is fitted, the coefficient of this variable is the regression model’s intercept β_0.
pooled_X = sm.add_constant(pooled_X)
#Build the OLS regression model:
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
#Train the model on the (y, X) data set and fetch the training results:
pooled_olsr_model_results = pooled_olsr_model.fit()
#Print the training summary:
print(pooled_olsr_model_results.summary())
OLS Regression Results

| Dep. Variable: | Union Density | R-squared: | 0.197 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.182 |
| Method: | Least Squares | F-statistic: | 12.78 |
| Date: | Thu, 06 Apr 2023 | Prob (F-statistic): | 0.000765 |
| Time: | 00:21:10 | Log-Likelihood: | -70.391 |
| No. Observations: | 54 | AIC: | 144.8 |
| Df Residuals: | 52 | BIC: | 148.8 |
| Df Model: | 1 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | -0.0897 | 0.127 | -0.707 | 0.483 | -0.344 | 0.165 |
| Happiness | 0.4216 | 0.118 | 3.575 | 0.001 | 0.185 | 0.658 |

| Omnibus: | 14.562 | Durbin-Watson: | 1.596 |
|---|---|---|---|
| Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 16.366 |
| Skew: | 1.119 | Prob(JB): | 0.000279 |
| Kurtosis: | 4.506 | Cond. No. | 1.28 |

Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The Pearson correlation coefficient measures the linear relationship between two datasets. Like other correlation coefficients, it varies between -1 and +1, with 0 implying no correlation; correlations of -1 or +1 imply an exact linear relationship.
Note that the p-value relies on the assumption that each dataset is normally distributed.
$r = \frac{\sum(x-m_x)(y-m_y)}{\sqrt{\sum(x-m_x)^2\sum(y-m_y)^2}}$
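This formula is only a few numpy operations; a minimal hand-rolled version (using the X and y defined for the OLS model above) should agree with scipy's result below:
# hand-rolled Pearson r, mirroring the formula above
x_arr = np.asarray(X, dtype=float)
y_arr = np.asarray(y, dtype=float)
num = np.sum((x_arr - x_arr.mean()) * (y_arr - y_arr.mean()))
den = np.sqrt(np.sum((x_arr - x_arr.mean())**2) * np.sum((y_arr - y_arr.mean())**2))
print(num / den)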
# verify there are no NaN or infinite values in either variable (a notebook cell only displays its last expression)
np.isnan(X).any()
np.isnan(y).any()
np.isinf(X).any()
np.isinf(y).any()
False
# replace any remaining NaNs with 0 as a precaution, then compute Pearson's r
X = np.nan_to_num(X)
y = np.nan_to_num(y)
sp.stats.pearsonr(X, y)
PearsonRResult(statistic=0.44422304265859053, pvalue=0.0007654853445457307)