An Exploration of the Washington Post Opioid Dataset for Tennessee
Contact
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
from ipyleaflet import *
import warnings
warnings.filterwarnings('ignore')
# magic function making plot outputs appear within the notebook
%matplotlib inline
# change the default plot output size
plt.rcParams['figure.figsize'] = [18, 8]
The Tennesee opioid data (2006-2012) can be downloaded via this direct link. Note that the downloaded zip file is only 139 MB. When uncompressed, the tab separated values (TSV) file is 2.4 GB, which contains 5.75 millions rows. If you are interested in opioid data for other U.S. states, check this DEA opioid database by the Washington Post.
# set file paths of opioid and population data
data_path = "/media/hdd/Data/Opioids/arcos-tn-statewide-itemized.tsv"
pop_path = '/media/hdd/Data/Opioids/tn-county-pop-2019.csv'
# read population data
pop = pd.read_csv(pop_path)
# read opioid data
data = pd.read_csv(data_path, sep='\t')
There are 42 columns in this dataset. The DOSAGE_UNIT column is the number of pills. More information about each individual column can be found in the DEA ARCOS's Handbook.
Some key columns we will be using include:
for col in data.columns:
print(col)
print(f"\nTotal number of columns: {len(data.columns)}")
# Take a look at the first five rows
data.head()
# Take a look at the last five rows
data.tail()
# Select 11 columns for further investigation
sel_cols = ['REPORTER_NAME', 'BUYER_BUS_ACT', 'BUYER_NAME', 'BUYER_CITY', 'BUYER_COUNTY', 'BUYER_STATE',
'BUYER_ZIP', 'DRUG_NAME', 'TRANSACTION_DATE', 'DOSAGE_UNIT', 'Combined_Labeler_Name']
data = data[sel_cols]
for col in data.columns:
print(col)
data.head()
print(f"Total number of rows: {len(data.index):,}")
# Check the transaction date format. Note that it is in the MMDDYYYY format, but we need to pad zeros
data['TRANSACTION_DATE'].head()
# Convert TRANSACTION_DATE from String to DateTime and pad zeros
data['TRANSACTION_DATE'] = pd.to_datetime(data['TRANSACTION_DATE'].astype(str).str.zfill(8), format='%m%d%Y')
# Take a look at the revised TRANSACTION_DATE
data['TRANSACTION_DATE'].head()
# Take a look at the first five rows again with revised TRANSACTION_DATE
data.head()
total_pills = int(data.DOSAGE_UNIT.sum())
print(f"Total number of pills distributed to Tennessee (2006-2012): {total_pills: ,}")
total_pop = pop['Pop'].sum()
print(f"Total population in Tennessee: {total_pop:,}")
avg_pills = int(total_pills / total_pop)
print(f"Average number of pills per person: {avg_pills}")
# Check TN counties
counties = data['BUYER_COUNTY']
print(sorted(set(counties)))
print(f"\nTotal number of counties: {len(set(counties))}")
# Check TN cities
cities = data['BUYER_CITY']
# print(sorted(set(cities)))
print(f"Total number of cities: {len(set(cities))}")
Let's summarize the dataset by year to see how the number of opioids were distributed to Tennessee over time.
pills_by_year = data[['TRANSACTION_DATE', 'DOSAGE_UNIT']].groupby(data.TRANSACTION_DATE.dt.year).sum()
pills_by_year
# Plot the trend by year
ax = pills_by_year.plot.bar(title="Pill Count Summary by Year", rot=0)
ax.set_xlabel("Year")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's summarize the dataset by month to see how the number of opioids were distributed to Tennessee over time.
pills_by_month = data[['TRANSACTION_DATE', 'DOSAGE_UNIT']].groupby(data.TRANSACTION_DATE.dt.month).sum()
pills_by_month['Month'] = pills_by_month.index
pills_by_month.index = pills_by_month['Month'].apply(lambda x: calendar.month_abbr[x])
pills_by_month = pills_by_month[['DOSAGE_UNIT']]
pills_by_month
# Plot the trend by month
ax = pills_by_month.plot.bar(title="Pill Count Summary by Month", rot=0)
ax.set_xlabel("Month")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's summarize the dataset by day to see how the number of opioids were distributed to Tennessee over time.
pills_by_day = data[['TRANSACTION_DATE', 'DOSAGE_UNIT']].groupby(data.TRANSACTION_DATE.dt.day).sum()
pills_by_day
# Plot the trend by day
ax = pills_by_day.plot(title="Pill Count Summary by Day", rot=0, style='-', marker='o')
ax.set_xlabel("Day")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's summarize the dataset by day of week to see how the number of opioids were distributed to Tennessee over time.
pills_by_dayofweek = data[['TRANSACTION_DATE', 'DOSAGE_UNIT']].groupby(data.TRANSACTION_DATE.dt.dayofweek).sum()
pills_by_dayofweek['DayofWeek'] = pills_by_dayofweek.index
pills_by_dayofweek.index = pills_by_dayofweek['DayofWeek'].apply(lambda x: calendar.day_abbr[x])
pills_by_dayofweek = pills_by_dayofweek[['DOSAGE_UNIT']]
pills_by_dayofweek
ax = pills_by_dayofweek.plot.bar(title="Pill Count Summary by Day of Week", rot=0)
ax.set_xlabel("Day of Week")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's look at the total pills per month by year:
pills_by_year_month = data[['TRANSACTION_DATE', 'DOSAGE_UNIT']].groupby([data.TRANSACTION_DATE.dt.month,
data.TRANSACTION_DATE.dt.year]).sum()
# pills_by_year_month.index
# pills_by_month['Month'] = pills_by_month.index
# pills_by_month.index = pills_by_month['Month'].apply(lambda x: calendar.month_abbr[x])
# pills_by_month = pills_by_month[['DOSAGE_UNIT']]
ax = pills_by_year_month['DOSAGE_UNIT'].unstack().plot(style='-', marker='o')
ax.set_xlabel("Month")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's take a look at the top 10 distributors:
distributors = data[['REPORTER_NAME', 'DOSAGE_UNIT']].groupby('REPORTER_NAME').sum()
distributors_top10 = distributors.sort_values('DOSAGE_UNIT', ascending=False).head(10)
distributors_top10
The table above shows that the 2nd distributor (WALGREEN CO) and 8th distributor (WALGREEN CO.) are treated as different distirbutors. We need to remove the extra period from the 8th distributor.
data['REPORTER_NAME'] = data['REPORTER_NAME'].str.replace('.', '', regex=False)
distributors = data[['REPORTER_NAME', 'DOSAGE_UNIT']].groupby('REPORTER_NAME').sum()
distributors_top10 = distributors.sort_values('DOSAGE_UNIT', ascending=False).head(10)
distributors_top10
distributors_top10.index = distributors_top10.index.str[:15]
ax = distributors_top10.plot.bar(title="Pill Count Summary by Distributor", rot=0)
ax.set_xlabel("Distributor")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's take a look at the top 10 manufacturers:
manufacturers = data[['Combined_Labeler_Name', 'DOSAGE_UNIT']].groupby('Combined_Labeler_Name').sum()
manufacturers_top10 = manufacturers.sort_values('DOSAGE_UNIT', ascending=False).head(10)
manufacturers_top10
manufacturers_top10.index = manufacturers_top10.index.str[:15]
ax = manufacturers_top10.plot.bar(title="Pill Count Summary by Manufacturer", rot=0)
ax.set_xlabel("Manufacturer")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
Let's take a look at the top 10 buyers:
buyers = data[['BUYER_NAME', 'DOSAGE_UNIT']].groupby('BUYER_NAME').sum()
buyers_top10 = buyers.sort_values('DOSAGE_UNIT', ascending=False).head(10)
buyers_top10
buyers_top10.index = buyers_top10.index.str[:15]
ax = buyers_top10.plot.bar(title="Pill Count Summary by Buyer", rot=0)
ax.set_xlabel("Buyer")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
counties = data[['BUYER_COUNTY', 'DOSAGE_UNIT']].groupby('BUYER_COUNTY').sum()
counties_top10 = counties.sort_values('DOSAGE_UNIT', ascending=False).head(10)
counties_top10
ax = counties_top10.plot.bar(title="Pill Count Summary by County", rot=0)
ax.set_xlabel("County")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])
cities = data[['BUYER_CITY', 'DOSAGE_UNIT']].groupby('BUYER_CITY').sum()
cities_top10 = cities.sort_values('DOSAGE_UNIT', ascending=False).head(10)
cities_top10
ax = cities_top10.plot.bar(title="Pill Count Summary by City", rot=0)
ax.set_xlabel("City")
ax.set_ylabel("Pill Count")
vals = ax.get_yticks().astype(int)
ax.set_yticklabels(['{:,}'.format(x) for x in vals])