# Arcos Data Parsing R
## Read Data
```
library(pdftools)
library(tidyverse)
# if you want to use here():
# 0) install pkg here
# 1) start a project and put this script somewhere in
# its directory (say <project>/code)
# 2) put report_yr_2021.pdf in the directory <project>/data/raw
cool <- here::here('data/raw', 'report_yr_2021.pdf')
file_path <-
ifelse(file.exists(cool), cool,
'https://www.deadiversion.usdoj.gov/arcos/retail_drug_summary/report_yr_2021.pdf')
arcos_text <- pdf_text(file_path)
# data from report 1 only
# first collapse multiple strings (one per pdf page) into one
arcos <- str_flatten(arcos_text[grep('ARCOS 3 - REPORT 01', arcos_text)])
# then split on newlines and vectorize
a1 <- unlist(str_split(arcos, '\n'))
```
## Inspect and Clean Data
```
# check pattern of lines with and without data we need:
# all lines are either
length(a1) ==
# 1) state/drug ids and corresponding gram values by zip,
length(grep('^DRUG|^STATE|^\\d{3}', a1)) +
# 2) empty lines, or
length(empty <- grep('^$', a1)) +
# 3) repeated title/header/summary rows
length(non_data <- grep('^DATE RANGE|WITHIN STATE$|REGISTRANT|Total', a1)) # TRUE
# remove non-data and empty elements
a1 <- a1[-c(non_data, empty)]
# positions within the vector where state/drug ids occur
drug_idx <- grep('^DRUG', a1)
state_idx <- grep('^STATE', a1)
# to count reps of drug ids, leave state ids out (and vice versa)
a1_drug_only <- a1[-state_idx]
drug_only_idx <- grep('^DRUG', a1_drug_only)
a1_state_only <- a1[-drug_idx]
state_only_idx <- grep('^STATE', a1_state_only)
# verify that these pick out the same elements of the vector
identical(a1_drug_only[drug_only_idx], a1[drug_idx]) # TRUE
identical(a1_state_only[state_only_idx], a1[state_idx]) # TRUE
# distances between id positions (minus one) are numbers of reps
# -- 1 + length(vector) is a "phantom" position just after the
# end of the entire vector, to account for the last group
drug_reps <- diff(c(drug_only_idx, 1 + length(a1_drug_only))) - 1
state_reps <- diff(c(state_only_idx, 1 + length(a1_state_only))) - 1
# check that total lengths of ids are equal to length of data
data_idx <- grep('^\\d{3}', a1)
sum(drug_reps) == length(data_idx) # TRUE
sum(state_reps) == length(data_idx) # TRUE
```
## Final Data
```
str2cols <- function(x) {
out <- unlist(str_split(x, boundary('word')))
names(out) <- c('zip3', paste0('q', 1:4), 'totalgrams')
return(out)
}
# applied to first data row
# > str2cols(a1[data_idx][1])
# zip3 q1 q2 q3 q4 totalgrams
# "995" "4,861.98" "5,185.29" "5,364.84" "5,264.97" "20,677.08"
arcos2021 <-
bind_cols(drug = rep(a1[drug_idx], drug_reps),
state = rep(a1[state_idx], state_reps),
bind_rows(lapply(a1[data_idx], str2cols)),
year = 2021) %>%
mutate(across(q1:totalgrams, ~ parse_number(.x))) %>%
extract(drug, c('drugcode', 'drugname'),
'^DRUG: ([[:alnum:]]+) - ([[:print:]]+)') %>%
separate(state, c(NA, 'state'), ' - ')
```
## Some Drugs Appear More Than Once Per Zip
It's because some zip3s are associated with more than one state. That seems odd.
```
arcos2021 %>%
count(drugcode, zip3) %>%
filter(n > 1) %>%
left_join(arcos2021) %>%
select(drugcode, drugname, state, zip3) %>%
knitr::kable()
```
|drugcode |drugname |state |zip3 |
|:--------|:--------------------------------|:--------------|:----|
|1100 |AMPHETAMINE |AMERICAN SAMOA |967 |
|1100 |AMPHETAMINE |HAWAII |967 |
|1724 |METHYLPHENIDATE (DL;D;L;ISOMERS) |AMERICAN SAMOA |967 |
|1724 |METHYLPHENIDATE (DL;D;L;ISOMERS) |HAWAII |967 |
|9050 |CODEINE |AMERICAN SAMOA |967 |
|9050 |CODEINE |HAWAII |967 |
|9143 |OXYCODONE |CONNECTICUT |063 |
|9143 |OXYCODONE |NEW YORK |063 |
|9143 |OXYCODONE |AMERICAN SAMOA |967 |
|9143 |OXYCODONE |HAWAII |967 |
|9150 |HYDROMORPHONE |AMERICAN SAMOA |967 |
|9150 |HYDROMORPHONE |HAWAII |967 |
|9193 |HYDROCODONE |AMERICAN SAMOA |967 |
|9193 |HYDROCODONE |HAWAII |967 |
|9300 |MORPHINE |CONNECTICUT |063 |
|9300 |MORPHINE |NEW YORK |063 |
|9300 |MORPHINE |AMERICAN SAMOA |967 |
|9300 |MORPHINE |HAWAII |967 |
|9801 |FENTANYL BASE |AMERICAN SAMOA |967 |
|9801 |FENTANYL BASE |HAWAII |967 |
###### tags: `RegNJP`