import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import plotly.express as px
import plotly.graph_objects as go
# Load the Trump vote-share CSV from the Data Bootcamp GitHub repository.
# The file is read with latin-1 encoding, as in the original notebook.
trump = pd.read_csv(
    'https://raw.githubusercontent.com/mwaugh0328/Data_Bootcamp_Fall_2017/master/data_bootcamp_1127/trump_data.csv',
    encoding='latin-1',
)

# Quick look at the raw data (these expressions only display in a notebook).
trump.head()
trump.shape
trump.describe()
# The frame still contains text columns (names, codes) at this point.
# DataFrame.corr() no longer drops them silently on recent pandas and raises
# instead, so compute the correlations on the numeric columns only.
trump.select_dtypes(include='number').corr()
trump.dtypes
Drop the columns that are not needed for the analysis, such as `Unnamed: 0`.
# Columns to discard from the data set.
columns_to_drop = [
    'Unnamed: 0',
    'NAME',
    'Party',
    'StateName',
    'FIPS',
    '_merge',
    'Candidate',
    'county',
    'state',
    'CountyFips',
]

# Mapping from the original column names to the display names used below.
display_names = {
    'population': 'Population',
    'income': 'Income',
    'StateCode': 'State',
    'CountyName': 'County',
    'CountyTotalVote': 'County Total Vote',
    'VoteCount': 'Vote Count',
    'trump_share': 'Trump Share',
}

# Both operations modify the existing DataFrame in place.
trump.drop(columns=columns_to_drop, inplace=True)
trump.rename(columns=display_names, inplace=True)
Some county names are written in all uppercase, so we convert every name to title case for a consistent format.
# Put every county name into title case.
trump['County'] = trump['County'].str.title()

# Keep the ten rows with the largest Trump share (sorted descending).
ranked_by_share = trump.sort_values(by='Trump Share', ascending=False)
top10_trump_support = ranked_by_share.head(10)

# Sunburst chart: states on the inner ring, counties on the outer ring,
# with both size and colour driven by the Trump share.
fig = px.sunburst(
    top10_trump_support,
    path=['State', 'County'],
    values='Trump Share',
    color='Trump Share',
    color_continuous_scale='OrRd',
    title='Top 10 Trump Support County and State',
    hover_data=['Trump Share'],
)
fig.update_layout(coloraxis_colorbar_title='Trump Share')
# Show only the Trump share (first hover_data entry) in the hover box.
fig.update_traces(hovertemplate="Trump Share: %{customdata[0]}")
fig.show()
You can see that Texas is the number one state for Trump support in this top-10 chart, and Roberts County has the highest Trump share among the Texas counties.
# Density heatmap of income against Trump share, with a histogram on each margin.
fig = px.density_heatmap(
    trump,
    x="Income",
    y="Trump Share",
    marginal_x="histogram",
    marginal_y="histogram",
    title='Income and Trump Share',
)
fig.show()

# Natural log of population, stored as a new column for plotting.
trump['Population_ln'] = np.log(trump['Population'])

# Scatter plot of log population against Trump share, without grid lines.
fig = px.scatter(
    trump,
    x='Population_ln',
    y='Trump Share',
    labels={'Population_ln': 'Population (ln)'},
    title='Relationship Between Population and Trump Share',
    template='none',
)
for hide_grid in (fig.update_xaxes, fig.update_yaxes):
    hide_grid(showgrid=False)
fig.show()
Ordinary Least Squares (OLS) fits a function to the data by minimizing the sum of squared errors. This method is used to estimate the unknown parameters in a linear regression model.
import statsmodels.formula.api as smf

# Natural log of income, with missing values forward-filled.
# The filled Series is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on trump['Income_ln'] is a chained in-place update
# that newer pandas does not propagate to the DataFrame, and the `method=`
# argument of fillna is deprecated in favour of .ffill().
trump['Income_ln'] = np.log(trump['Income']).ffill()

# Regress Trump share on log income. Q("...") quotes the column name because
# it contains a space.
ols_reg = smf.ols('Q("Trump Share") ~ Income_ln', trump).fit()

# Store the in-sample fitted values next to the data and report the fit.
trump['pred_ols_reg'] = ols_reg.predict()
print(ols_reg.summary())
trump['pred_ols_reg'].head()
We can also plot the OLS fit directly using Plotly's trendline option.
# Scatter plot of log income against Trump share with Plotly's built-in OLS
# trendline drawn in red. The title names Income, matching the x-axis
# (the original title said "Population", copied from the earlier plot).
fig = px.scatter(
    trump,
    x='Income_ln',
    y='Trump Share',
    trendline='ols',
    labels={'Income_ln': 'Income (ln)'},
    title='Relationship Between Income and Trump Share',
    template='none',
    trendline_color_override='red',
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
Random Forest (RF) is an ensemble technique that performs regression and classification tasks using multiple decision trees; here we use its regressor.
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split

# Natural log of income, with missing values forward-filled.
# Assign the filled Series back explicitly: fillna(..., inplace=True) on
# trump['Income_ln'] is a chained in-place update that newer pandas does not
# propagate to the DataFrame, and fillna's `method=` argument is deprecated.
trump['Income_ln'] = np.log(trump['Income']).ffill()

# Single feature (log income) as a 2-D array; target is the Trump share.
features = trump[['Income_ln']].values
target = trump['Trump Share'].values

# Fix the seeds so the split, the score and the predictions are reproducible
# from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, random_state=0
)
skl_rf = rf(n_estimators=100, random_state=0).fit(x_train, y_train)

# R^2 on the held-out split (only displayed when run in a notebook).
skl_rf.score(x_test, y_test)

# Predict for every row, including the rows the model was trained on.
trump['pred_skl_rf'] = skl_rf.predict(features)
trump['pred_skl_rf'].head()
# Overlay the actual Trump share and the random-forest predictions,
# both plotted against log income.
actual_points = go.Scatter(
    x=trump['Income_ln'],
    y=trump['Trump Share'],
    mode='markers',
    name='Trump Share',
)
rf_points = go.Scatter(
    x=trump['Income_ln'],
    y=trump['pred_skl_rf'],
    mode='markers',
    name='RF Predicted',
)

fig = go.Figure()
for trace in (actual_points, rf_points):
    fig.add_trace(trace)

fig.update_layout(
    template='none',
    title='Income and Trump Share Predicted by Random Forest',
    xaxis_title='Income (ln)',
    yaxis_title='Trump Share',
)
# Hide the grid lines on both axes.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
K-Nearest Neighbors (KNN) is a simple supervised machine learning technique that groups and classifies each case based on its similarity to its neighbors. Used as a regressor, as it is here, it predicts each case from the values of its nearest neighbors.
from sklearn.neighbors import KNeighborsRegressor as knn

# Single feature (log income) as a 2-D array; target is the Trump share.
income_features = trump[['Income_ln']].values
share_target = trump['Trump Share'].values

# Fit a 100-neighbour regressor on the full data set.
skl_knn = knn(n_neighbors=100)
skl_knn.fit(income_features, share_target)

# Predict for the same rows and keep the result next to the data.
trump['pred_skl_knn'] = skl_knn.predict(income_features)
trump['pred_skl_knn'].head()
# Overlay the actual Trump share and the KNN predictions, both plotted
# against log income.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=trump['Income_ln'],
        y=trump['Trump Share'],
        mode='markers',
        name='Trump Share',
    )
)
fig.add_trace(
    go.Scatter(
        x=trump['Income_ln'],
        y=trump['pred_skl_knn'],
        mode='markers',
        name='KNN Predicted',
    )
)
# Title spelling corrected ("Nereast" -> "Nearest"); it is shown on the chart.
fig.update_layout(
    template='none',
    title='Income and Trump Share Predicted by K Nearest Neighbor',
    xaxis_title='Income (ln)',
    yaxis_title='Trump Share',
)
# Hide the grid lines on both axes.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
These are a few very simple ML models that I learned in the NYU Data Bootcamp class. Although they are not the most accurate models (and may be very wrong), I noticed that the random forest regression model's predictions are very close to the actual Trump vote share. Also, locations with a higher Trump vote share tend to have lower income than locations with a lower Trump vote share. Even though I am not a very political person, I found this vote analysis quite interesting!