Reshaping Clinics Data

In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
In [2]:
# get current working directory
cwd = os.getcwd()

# set input and output Excel files
in_excel_file = os.path.join(cwd, 'data', 'pain_clinics_historic_to_20190820.xlsx')
out_excel_file = os.path.join(cwd, 'data', 'KnoxvilleAreapain_clinics.xlsx')

# read Excel file 
df = pd.read_excel(in_excel_file,header=None)

# show the first ten rows
df.head(10)
Out[2]:
0 1 2 3
0 1.0 Absolute Medical Care Medical Director: Hau La, M.D. Facility License Number: 00000592
1 NaN 713 B President Place NaN Status: Lic Surrendered
2 NaN Smyrna, TN 37167 NaN Date of Original Licensure: 03/07/2016
3 NaN 615-667-1299 NaN Date of Expiration: 03/31/2018
4 NaN NaN NaN 1 Disciplinary Action(s)
5 2.0 ACCELERATED MEDICAL OF SEYMOUR Medical Director: GHOLAMREZA SHAREGHI Facility License Number: 00000000
6 NaN 319 BLUE PEACOCK WAY NaN Status: Closed Appl
7 NaN Seymour, TN 37865 NaN Date of Original Licensure:
8 NaN 865-216-8930 NaN Date of Expiration:
9 NaN NaN NaN No Disciplinary Actions
In [4]:
# define a list for storing info about each clinic
clinics = []

# loop through each row
for row_index in range(0, len(df.index)):
    row = df.iloc[row_index]
    if row[0] > 0:
        
        clinic = {}
        clinic['Id'] = row[0]
       
        # fix records with missing information, such as city name, phone, etc.
        if clinic['Id'] == 55:
            df.set_value(row_index + 2, 1, 'Johnson City, TN 37601')
        if clinic['Id'] == 542:
            df.set_value(row_index + 2, 1, 'Columbia, TN 38401')  
        if clinic['Id'] == 98:
            df.set_value(row_index, 2, 'Medical Director: NA')
        if clinic['Id'] in [122, 145, 398, 496, 542, 545, 548, 553, 639, 752]:
            df.set_value(row_index + 3, 1, '000-000-0000')         

        # extract clinic info
        clinic['Name'] = row[1]
        clinic['MedDirector'] = str(row[2]).split(":")[1].strip()
        clinic['FacLicNum'] = row[3].split(":")[1].strip()
        clinic['Street'] = df.iloc[row_index + 1][1].strip()
        clinic['Status'] = df.iloc[row_index + 1][3].split(":")[1].strip()
        clinic['City'] = df.iloc[row_index + 2][1].split(',')[0].strip()
        clinic['State'] = df.iloc[row_index + 2][1].split(',')[1].strip().split(' ')[0]
        clinic['Zip'] = df.iloc[row_index + 2][1].split(',')[1].strip().split(' ')[1]
        clinic['OrgLic'] = df.iloc[row_index + 2][3].split(":")[1].strip()
        clinic['Phone'] = df.iloc[row_index + 3][1].strip()
        clinic['ExpDate'] = df.iloc[row_index + 3][3].split(":")[1].strip()
        clinic['Displinary'] = df.iloc[row_index + 4][3].strip()
        clinics.append(clinic)        
        
    else:
        pass
In [5]:
print(f"Total number of clinics: {len(clinics)}")
Total number of clinics: 849
In [6]:
# convert list to datafroame
out_df = pd.DataFrame(clinics)
In [7]:
# show the first five rows. Note that columns are order alphabetically
out_df.head()
Out[7]:
Id Name MedDirector FacLicNum Street Status City State Zip OrgLic Phone ExpDate Displinary
0 1.0 Absolute Medical Care Hau La, M.D. 00000592 713 B President Place Lic Surrendered Smyrna TN 37167 03/07/2016 615-667-1299 03/31/2018 1 Disciplinary Action(s)
1 2.0 ACCELERATED MEDICAL OF SEYMOUR GHOLAMREZA SHAREGHI 00000000 319 BLUE PEACOCK WAY Closed Appl Seymour TN 37865 865-216-8930 No Disciplinary Actions
2 3.0 ACCEPTANCE HEALTH FRANCISCO AVILA, M.D. 00000530 1012 SOUTH CHANCERY STREET Facility Closed Mc Minnville TN 37110 11/18/2014 931-507-6325 11/30/2016 No Disciplinary Actions
3 4.0 ACCESS SOLUTIONS WELLNESS, P.C. LAWRENCE JOSEPH VALDEZ, M.D. 00000276 805 S. CHURCH STREET SUITE 20 Expired Murfreesboro TN 37130 04/03/2012 615-962-9011 04/30/2014 No Disciplinary Actions
4 5.0 Active Pain Management Solutions David McAfee, M.D. 00000637 620 S Jefferson Ave, Suite 202 Facility Closed Cookeville TN 38501 02/02/2017 931-526-7246 02/28/2019 No Disciplinary Actions
In [8]:
# let's reorder the columns
col_names = ['Id', 'Name', 'Street', 'City', 'State', 'Zip', 'Phone', 'MedDirector', 'FacLicNum', 'Status', 'OrgLic',
            'ExpDate', 'Displinary']
out_df = out_df.reindex(columns=col_names)

# show the first five rows 
out_df.head()
Out[8]:
Id Name Street City State Zip Phone MedDirector FacLicNum Status OrgLic ExpDate Displinary
0 1.0 Absolute Medical Care 713 B President Place Smyrna TN 37167 615-667-1299 Hau La, M.D. 00000592 Lic Surrendered 03/07/2016 03/31/2018 1 Disciplinary Action(s)
1 2.0 ACCELERATED MEDICAL OF SEYMOUR 319 BLUE PEACOCK WAY Seymour TN 37865 865-216-8930 GHOLAMREZA SHAREGHI 00000000 Closed Appl No Disciplinary Actions
2 3.0 ACCEPTANCE HEALTH 1012 SOUTH CHANCERY STREET Mc Minnville TN 37110 931-507-6325 FRANCISCO AVILA, M.D. 00000530 Facility Closed 11/18/2014 11/30/2016 No Disciplinary Actions
3 4.0 ACCESS SOLUTIONS WELLNESS, P.C. 805 S. CHURCH STREET SUITE 20 Murfreesboro TN 37130 615-962-9011 LAWRENCE JOSEPH VALDEZ, M.D. 00000276 Expired 04/03/2012 04/30/2014 No Disciplinary Actions
4 5.0 Active Pain Management Solutions 620 S Jefferson Ave, Suite 202 Cookeville TN 38501 931-526-7246 David McAfee, M.D. 00000637 Facility Closed 02/02/2017 02/28/2019 No Disciplinary Actions
In [9]:
# save the dataframe to Excel
out_df.to_excel(out_excel_file, index=False)
In [ ]: