Medications are classified using a linear support vector classifier, trained by a family physician. The medications are grouped into 6 categories:
1. Oral antibiotics
2. Opioids (excluding methadone and suboxone)
3. Opioid addiction treatments (methadone, suboxone, naloxone, etc.)
4. Benzodiazepines and z-drugs (Zopiclone)
5. All other medications
6. Non-medications (massage, physiotherapy, glucose test strips, etc.)
The combined average F1-score of the classifier is ~90%. This classifier will be periodically re-assessed to ensure continued accuracy.
# Substrings that suggest a B-vitamin product.
_VITAMIN_B_MARKERS = ('b12', 'b6', 'b2', 'b50', 'b100', 'b 100', 'b 12', 'b 50')

# Substrings that veto the vitamin-B shortcut: markers such as 'b 50' also
# occur inside strings like 'tab 500 mg'. ('tb24' is presumably short for
# "24 tabs" -- it shows up often in the data; unconfirmed.)
_VITAMIN_B_VETOES = ('tab 50', 'tab 100', 'tb24', 'tab250', 'tab 250')

# Concentrations that, together with the word 'solution', mark a topical product.
_TOPICAL_SOLUTION_STRENGTHS = ('1%', '2%', '4%', '0.1%', '0.2%')

# Substrings that mark topical / eye / ear preparations. Collapsing these keeps
# e.g. erythromycin ointments from being confused with oral erythromycin.
# Misspellings are intentional: they are matched verbatim against the raw text.
_TOPICAL_MARKERS = (
    ' ottic', 'ophth', 'erythro base', 'polymyxin', 'elidel', 'clobatesol',
    'glaxyl', 'glaxol base', 'glaxal', 'eye oint', 'clotrimaderm crm',
    ' gel ', ' gel', 'lyderm', 'neostrata', 'ointment', 'topical', 'cream',
    'ophthalamic', ' opth', 'ophthalmic ont', 'oph oint', 'dilusol',
    'solugel', 'erythro/hc/niz', 'biobase', 'ophthalmic', 'benzoyl',
    ' otic', 'lotion',
)

# Ordered (old, new) substitutions applied to the truncated name. The order is
# significant (e.g. ' -' is handled before '-'), so keep it as-is.
_NAME_SUBSTITUTIONS = (
    ('prescribed:', ''),
    ('name:', ''),
    ('qty', ''),
    ('apply', ''),
    ('take', ''),
    ('daily', ''),
    ('sodium', ''),
    ('dom-', ''),
    ('teva-', ''),
    ('jamp-', ''),
    ('novo-', ''),
    ('(', ' '),
    (')', ' '),
    ('ava-', ''),
    ('ratio-', ''),
    ('pms-', ''),
    ('auro-', ''),
    ('ntp-', ''),
    ('gen-', ''),
    ('act-', ''),
    ('ach-', ''),
    ('taro-', ''),
    ('phl-', ''),
    ('apo-', ''),
    ('accel-', ''),
    (' hcl', ''),
    (' dose', ''),
    (' frequency', ''),
    (' amount', ''),
    ('current meds', ''),
    ('recorded ', ''),
    (' -', ' '),
    ('-', ' '),
    ('.', ' '),
    ('|', ' '),
    (',', ' '),
    ('!', ' '),
    ('?', ' '),
    ('/', ' '),
    (':', ' '),
    (';', ' '),
    (' take', ' '),
    ('[xhcnx]', ' '),
    ('[xdoctorx]', ' '),
    (' as directed', ' '),
)


def clean_medication_text(raw_text: str) -> str:
    """Reduce a free-text medication entry to a short, normalized name.

    The text is lower-cased and then handled in two stages:

    1. Shortcut categories. If the text matches one of the marker lists it
       is collapsed to a fixed label and returned immediately:
       'vitamin b', 'injectable treatment', 'topical treatment',
       'vaginal or rectal treatment' or 'vaccination'.
    2. Core-name extraction. Otherwise the text is cut just before the
       first number that does not start the string (dropping strength and
       quantity), a fixed list of filler words, manufacturer prefixes and
       punctuation is removed, and runs of spaces are collapsed.

    Args:
        raw_text: The medication text as recorded.

    Returns:
        Either one of the fixed category labels above or the cleaned
        medication name (possibly an empty string).
    """
    text = raw_text.lower()

    # --- Stage 1: shortcut categories (checked in this order) ---
    if (any(marker in text for marker in _VITAMIN_B_MARKERS)
            and not any(veto in text for veto in _VITAMIN_B_VETOES)):
        return 'vitamin b'

    if 'injection' in text:
        return 'injectable treatment'

    if 'solution' in text and any(
            strength in text for strength in _TOPICAL_SOLUTION_STRENGTHS):
        return 'topical treatment'

    # Oral solutions and powders for solution are not treated as topical.
    if (any(marker in text for marker in _TOPICAL_MARKERS)
            and 'oral solution' not in text
            and 'powder for solution' not in text):
        return 'topical treatment'

    if 'vaginal' in text or 'rectal' in text:
        return 'vaginal or rectal treatment'

    if 'vaccine' in text:
        return 'vaccination'

    # --- Stage 2: find the core name of the medication ---
    # Cut at the first number that is not at position 0; a leading number is
    # kept because it is treated as part of the name.
    for number in re.finditer(r'[0-9]+(?:\.[0-9]+)?', text):
        if number.start() != 0:
            text = text[:number.start()]
            break

    for old, new in _NAME_SUBSTITUTIONS:
        text = text.replace(old, new)

    # Only plain spaces are trimmed and collapsed.
    text = text.strip(' ')
    return re.sub(' +', ' ', text)
('topical treatment', 42416)
('metformin', 6695)
('crestor', 5325)
('ventolin hfa', 4704)
('amoxicillin', 4536)
('synthroid', 4066)
('naproxen', 3811)
('vitamin b', 3483)
('clonazepam', 3196)
('cipralex', 3195)
('injectable treatment', 3152)
('lipitor', 2892)
('hydrochlorothiazide', 2878)
('percocet', 2824)
('coversyl', 2723)
('nasonex', 2706)
('ramipril', 2651)
('flovent hfa', 2646)
('norvasc tab', 2541)
('suboxone', 2539)
('synthroid tab', 2533)
('gabapentin', 2525)
('amlodipine', 2450)
('diamicron mr', 2407)
('ativan', 2376)
('azithromycin', 2275)
('tecta', 2235)
('baclofen', 2177)
('lyrica', 2118)
('tylenol with codeine no', 2063)
('acetaminophen', 1994)
('apo naproxen tab', 1985)
('lorazepam', 1946)
('reactine', 1938)
('bisoprolol', 1921)
('symbicort', 1910)
('cymbalta', 1879)
('vitamin d', 1806)
('effexor xr', 1770)
('alesse', 1766)
('omnaris', 1733)
('advair', 1732)
('sertraline', 1692)
('pantoloc', 1692)
('pantoprazole', 1682)
('trazodone', 1643)
('methadone', 1590)
('flexeril', 1507)
('lantus', 1490)
('tylenol w codeine no', 1471)
('metoprolol', 1426)
('asa', 1413)
('betaderm crm', 1394)
('keflex', 1391)
('arthrotec', 1366)
('omeprazole', 1363)
('amitriptyline', 1360)
('macrobid', 1357)
('amoxil', 1355)
('atorvastatin', 1335)
('ranitidine', 1324)
('atenolol', 1272)
('hydromorphone', 1258)
('lasix', 1241)
('janumet', 1239)
('celebrex', 1223)
('vimovo modified release tablet', 1197)
('zopiclone', 1192)
('altace', 1186)
('lamisil crm', 1176)
('ciprodex', 1168)
('elavil', 1154)
('valtrex', 1134)
('vaginal or rectal treatment', 1130)
('ibuprofen', 1115)
('levothyroxine', 1108)
('hycodan syrup', 1094)
('pariet enteric coated tablet', 1070)
('ferrous gluconate', 1058)
('peg', 1049)
('prednisone', 1039)
('concerta', 1033)
('chlorthalidone', 1032)
('citalopram', 1028)
('lactulose', 1013)
('diclectin', 1011)
('wellbutrin xl', 981)
('norvasc', 980)
('cipro', 972)
('eltroxin', 963)
('nexium', 954)
('celexa', 951)
('losec', 932)
('nortriptyline', 925)
('cialis', 916)
('warfarin', 900)
('candesartan', 886)
('viagra', 879)
('actonel', 877)
Total number labeled: 3427
Number labeled as oral antibiotic: 662
Number labeled as opioid: 252
Number labeled as opioid addiction treatment: 12
Number labeled as z-drug: 105
Number labeled as non-medication: 837
# Classification pipeline: token counts -> TF-IDF weighting -> linear SVC.
model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ],
)
BENZO
precision Min/Avg/Max: 91% / 97% / 100%
recall Min/Avg/Max: 75% / 91% / 100%
f1-score Min/Avg/Max: 82% / 94% / 100%
NOTMED
precision Min/Avg/Max: 92% / 95% / 99%
recall Min/Avg/Max: 88% / 91% / 94%
f1-score Min/Avg/Max: 91% / 93% / 96%
OTHERMED
precision Min/Avg/Max: 82% / 85% / 88%
recall Min/Avg/Max: 92% / 94% / 97%
f1-score Min/Avg/Max: 88% / 89% / 91%
OPIOID_TX
precision Min/Avg/Max: 0% / 88% / 100%
recall Min/Avg/Max: 0% / 63% / 100%
f1-score Min/Avg/Max: 0% / 69% / 100%
OPIOID
precision Min/Avg/Max: 81% / 91% / 98%
recall Min/Avg/Max: 84% / 91% / 100%
f1-score Min/Avg/Max: 86% / 91% / 96%
ABX
precision Min/Avg/Max: 91% / 93% / 97%
recall Min/Avg/Max: 71% / 76% / 83%
f1-score Min/Avg/Max: 81% / 84% / 88%
Accuracy Min/Avg/Max: 88% / 89% / 92%
Precision Min/Avg/Max: 88% / 90% / 92%
Recall Min/Avg/Max: 88% / 89% / 92%
F1 Min/Avg/Max: 87% / 89% / 92%