An Exploration of the Washington Post Opioid Dataset
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
from ipyleaflet import *
import warnings
warnings.filterwarnings('ignore')
# magic function making plot outputs appear within the notebook
%matplotlib inline
# change the default plot output size
plt.rcParams['figure.figsize'] = [18, 8]
# get current working directory
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
# set input and output Excel files
in_nation = os.path.join(data_dir, 'OpioidsPerYearTotalAndRate.xlsx')
in_state = os.path.join(data_dir, 'OpioidRxRateByState.xlsx')
in_county = os.path.join(data_dir, 'OpioidRxRate_County.xlsx')
in_rate = os.path.join(data_dir, 'US_Opioid_OverdoseDeathsByState.xlsx')
df_nation_total_rx = pd.read_excel(in_nation, sheet_name='TotalRx')
df_nation_total_rx.head()
# Plot the trend by year
ax = df_nation_total_rx.plot.bar(x=0, y=1, title="Total Prescriptions by Year", rot=0)
ax.set_xlabel("Year")
ax.set_ylabel("Total Prescriptions")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
# Plot the trend by year
ax = df_nation_total_rx.plot.bar(x=0, y=2, title="Prescription Rate Per 100 People", rot=0)
ax.set_xlabel("Year")
ax.set_ylabel("Prescription Rate")
vals = ax.get_yticks().astype(int)
# ax.set_yticklabels(['{:,}'.format(x) for x in vals])
df_nation_county_ptcp = pd.read_excel(in_nation, sheet_name='CountyParticipation')
df_nation_county_ptcp.head()
# Plot the trend by year
ax = df_nation_county_ptcp.plot.bar(x=0, y=2, title="Counties with Data", rot=0)
ax.set_xlabel("Year")
ax.set_ylabel("Total Prescriptions")
vals = ax.get_yticks().astype(int)
# ax.set_yticklabels(['{:,}'.format(x) for x in vals])
# Plot the trend by year
ax = df_nation_county_ptcp.plot.bar(x=0, y=3, title="Percent Counties with Data", rot=0)
ax.set_xlabel("Year")
ax.set_ylabel("Total Prescriptions")
vals = ax.get_yticks().astype(int)
df_state = pd.read_excel(in_state, sheet_name=None, ignore_index=True)
df_template = pd.read_excel(in_state, sheet_name=0)
for index, key in enumerate(df_state.keys()):
df_year = pd.read_excel(in_state, sheet_name=key)
df_template[key] = df_year.iloc[:, 2]
df_state=df_template.drop(columns=['2006 Prescribing Rate'])
df_state.head()
df_state['mean'] = df_state.mean(axis=1)
df_state.head()
out_state_file = in_state.replace(".xlsx", "_Merge.xlsx")
df_state.to_excel(out_state_file, index=False)
ax = df_state.plot.bar(x=1, y=range(2, 14), rot=0, title="Prescription Rate by State")
ax.set_xlabel("State")
ax.set_ylabel("Prescription Rate")
df_state_sort = df_state.sort_values(by='mean', ascending=False)
df_state_sort.head()
ax = df_state_sort.plot.bar(x=1, y=range(2, 14), rot=0, title="Prescription Rate by State")
ax.set_xlabel("State")
ax.set_ylabel("Prescription Rate")
df_state_top10 = df_state_sort.head(10)
df_state_top10
ax = df_state_top10.plot.bar(x=1, y=range(2, 14), rot=0, title="Prescription Rate by State")
ax.set_xlabel("State")
ax.set_ylabel("Prescription Rate")
ax = df_state_sort.plot.bar(x=1, y=14, rot=0, title="Mean Prescription Rate by State")
ax.set_xlabel("State")
ax.set_ylabel("Prescription Rate")
df_county = pd.ExcelFile(in_county)
sheet_names =df_county.sheet_names
print(sheet_names)
num_sheets = len(df_county.sheet_names)
print(f"Total number of sheets: {num_sheets}")
sheet_template = df_county.parse(sheet_name=sheet_names[0])[["County", "State", "FIPS County Code"]]
sheet_template.head()
for sheet_name in sheet_names[:-1]:
print(sheet_name)
sheet_year = df_county.parse(sheet_name)
# print(sheet_year.head()).
sheet_template= pd.merge(sheet_template, sheet_year, how='left', on=['FIPS County Code'])
# print(sheet_template.head())
sheet_template.head()
Filter and select columns:
filter_col = ["County", "State", "FIPS County Code"] + [col for col in sheet_template if col.startswith('20')]
filter_col
Display selected columns:
df_county_merge = sheet_template[filter_col]
df_county_merge.head()
col_names = ["County", "State", "FIPS County Code"] + sheet_names[:-1]
print(col_names)
Change column names:
df_county_merge.columns = col_names
df_county_merge['FIPS County Code']= df_county_merge['FIPS County Code'].astype(str)
df_county_merge['FIPS County Code'] = df_county_merge['FIPS County Code'].apply(lambda x: x.zfill(5))
df_county_merge.head()
Create a mean column:
df_county_merge = df_county_merge.replace(-9999, np.nan)
df_county_merge['mean'] = df_county_merge.mean(axis=1)
df_county_merge.head()
Save the merged opioid data as a new Excel file:
out_county_file = in_county.replace(".xlsx", "_Merge.xlsx")
df_county_merge.to_excel(out_county_file, index=False)
Sort mean prescription rate by county:
df_county = pd.read_excel(out_county_file, sheet_name=0, ignore_index=True)
df_county_sort = df_county.sort_values(by='mean', ascending=False)
df_county_top10 = df_county_sort.head(10)
df_county_top10
ax = df_county_top10.plot.bar(x=0, y=range(3, 15), rot=0, title="Top 10 County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")
ax = df_county_top10.plot.bar(x=0, y=15, rot=0, title="Mean Prescription Rate by County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")
Select Tennessee opioid data by county:
df_tn = df_county[df_county['State']=='TN']
df_tn.head(10)
ax = df_tn.plot.bar(x=0, y=15, rot=0, title="Mean Prescription Rate by County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")
df_tn_sort = df_tn.sort_values(by='mean', ascending=False)
ax = df_tn_sort.plot.bar(x=0, y=15, rot=0, title="Mean Prescription Rate by County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")
df_tn_top10 = df_tn_sort.head(10)
df_tn_top10
ax = df_tn_top10.plot.bar(x=0, y=15, rot=0, title="Top 10 TN County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")
ax = df_tn_top10.plot.bar(x=0, y=range(3, 15), rot=0, title="Top 10 County")
ax.set_xlabel("County")
ax.set_ylabel("Prescription Rate")