1. The Data

The data I am working with includes information on people who were released from prison at some point from January 1, 2018 through December 31, 2020. It includes basic demographic information, such as age, sex and race, as well as information about the types of crimes they were convicted of, the length of their stay in prison, their participation in educational and other programs while incarcerated, and violations of rules of conduct while incarcerated. The data also includes information about whether the person was employed during the 15 months prior to their incarceration and during the 15 months following their release from prison.

The dataset includes one variable for each possible outcome of participating in educational and other types of programming while incarcerated. While in some analyses we might want to compare groups based on how they completed classes in these program areas, for example those who successfully completed an anger management course as compared to those who participated but did not successfully complete, research also suggests that there is often a benefit from participation regardless of the outcome. Given the primary emphasis in this analysis on the relationship between factory employment while incarcerated and post-release employment, and an interest in creating a parsimonious model, program outcome variables are recoded into program participation variables. In other words, the counts of successful, unsuccessful, and no fault completions are added into one variable for each program area that is a count of the number of times a person was enrolled in a class in each program area.

There are four variables in the dataset that provide information on people’s employment outside their incarceration period. Two are measures of employment prior to incarceration and two are measures of employment after release from prison. All of these data are binary variables that indicate whether the person had wages reported in the federal Unemployment Insurance (UI) Wage Records system. UI Wage Records are reported quarterly by employers required to do so and by other employers who report them voluntarily. The EMPLOYMENT_PRE indicator is coded as a 1 if the person had wages reported in any of the five quarters prior to their incarceration. The STABLE_PRE indicator is coded as a 1 if the person had wages reported in the fifth, fourth, and third quarters prior to incarceration. The EMPLOYMENT_POST indicator is coded as a 1 if the person had wages reported in any of the five quarters following their incarceration period. The STABLE_POST indicator is coded as a 1 if the person had wages reported in the third, fourth, and fifth quarters after their release from prison.

SEX RACE ETHNIC ADMISSION_TYPE OFFENSE_GROUP EXIT_YEAR AGE_AT_EXIT RELEASE_TYPE
M W N I NONVIOLENT 2019 47 PRB
M W N L NONVIOLENT 2020 48 PPR
M B N S VIOLENT 2018 54 PPR
M W N I DRUG 2020 60 PRB
M W N S NONVIOLENT 2019 46 PPR

2. Exploratory Data Analysis

Ultimately, my intention with this project is to test whether there is a difference in post-release employment outcomes between people who were employed in factory jobs while incarcerated as compared to those who were not. Given there are 48,761 observations in the dataset, with only 2,686 of them having participated in some type of factory employment while incarcerated, my exploratory data analysis will be guided by the purpose of creating a matched sample (via propensity score matching) to be analyzed subsequently with regression techniques.

# Replacing missing values, recoding institutional programming variables,  
# transforming/recoding categorical variables as factors, and transforming integers as numeric.
library(tidyr)

# MatchIt is not able to work with fields that have missing values, so count
# the NA entries in every column up front.
# Iterating over the columns with vapply() (rather than apply(), which first
# coerces the whole data frame to a single-typed matrix) keeps each column's
# own type and guarantees a named integer vector, one count per column.
colmissing <- vapply(data, function(column) sum(is.na(column)), integer(1))
colmissing

# Build the recoded copy of `data` used for matching:
#   * the factory-job flag becomes a 0/1 factor,
#   * categorical fields become factors, counts and durations become numeric,
#   * missing vocational scores are replaced with 0 before factoring,
#   * for each program area the successful (SFL), unsuccessful (UNS) and
#     no-fault (NOF) completion counts are summed into one *_CLASSES count.
transformed_data <- transform(
  data,

  # Treatment indicator: 1 where MVE_FACTORY_JOB is "Y", otherwise 0.
  MVE_FACTORY_JOB = as.factor(ifelse(MVE_FACTORY_JOB == "Y", 1, 0)),

  # Demographics and sentence characteristics.
  SEX = as.factor(SEX),
  RACE = as.factor(RACE),
  ETHNIC = as.factor(ETHNIC),
  ADMISSION_TYPE = as.factor(ADMISSION_TYPE),
  OFFENSE_GROUP = as.factor(OFFENSE_GROUP),
  EXIT_YEAR = as.factor(EXIT_YEAR),
  AGE_AT_EXIT = as.numeric(AGE_AT_EXIT),
  RELEASE_TYPE = as.factor(RELEASE_TYPE),
  STAY_DAYS = as.numeric(STAY_DAYS),
  COUNT_CV = as.numeric(COUNT_CV),

  # Assessment scores, treated as categorical.
  MENTAL_HEALTH_SCORE = as.factor(MENTAL_HEALTH_SCORE),
  MEDICAL_SCORE = as.factor(MEDICAL_SCORE),
  EDUCATION_SCORE = as.factor(EDUCATION_SCORE),
  VOCATIONAL_SCORE = as.factor(
    ifelse(is.na(VOCATIONAL_SCORE), 0, VOCATIONAL_SCORE)
  ),

  # Work assignments while incarcerated.
  WORK_RELEASE = as.factor(WORK_RELEASE),
  FACTORY_DAYS = as.numeric(FACTORY_DAYS),
  MVE_FACTORY = as.factor(MVE_FACTORY),

  # Program participation: enrollments per program area, whatever the outcome.
  ANGER_MGMT_CLASSES = as.numeric(ANGER_SFL + ANGER_UNS + ANGER_NOF),
  CAREER_TECHNICAL_CLASSES = as.numeric(CT_SFL + CT_UNS + CT_NOF),
  COGNITIVE_CLASSES = as.numeric(
    COGNITIVE_SFL + COGNITIVE_UNS + COGNITIVE_NOF
  ),
  INSTITUTIONAL_TRTMNT_CLASSES = as.numeric(IT_SFL + IT_UNS + IT_NOF),
  LIFE_SKILLS_CLASSES = as.numeric(LS_SFL + LS_UNS + LS_NOF),
  PARENTING_CLASSES = as.numeric(
    PARENTING_SFL + PARENTING_UNS + PARENTING_NOF
  ),
  RE_ENTRY_CLASSES = as.numeric(RE_ENTRY_SFL + RE_ENTRY_UNS + RE_ENTRY_NOF),
  EDUCATION_CLASSES = as.numeric(
    EDUCATION_SFL + EDUCATION_UNS + EDUCATION_NOF
  ),
  COLLEGE_CLASSES = as.numeric(COLLEGE_SFL + COLLEGE_UNS + COLLEGE_NOF),

  # Employment indicators before incarceration and after release.
  EMPLOYMENT_PRE = as.factor(EMPLOYMENT_PRE),
  STABLE_PRE = as.factor(STABLE_PRE),
  EMPLOYMENT_POST = as.factor(EMPLOYMENT_POST),
  STABLE_POST = as.factor(STABLE_POST)
)

# Propensity-score formula: the treatment (factory job while incarcerated) is
# modelled on the covariates used to build the matched sample.
#
# EMPLOYMENT_POST and STABLE_POST are deliberately NOT included. They are the
# post-release outcomes this analysis sets out to compare, and matching on an
# outcome forces the treated and comparison groups to look alike on exactly
# the quantity being tested, biasing the estimated effect toward zero. Only
# the pre-incarceration employment indicators are used as covariates.
matching_model <- MVE_FACTORY_JOB ~ SEX + RACE + ETHNIC + ADMISSION_TYPE +
  OFFENSE_GROUP + EXIT_YEAR + AGE_AT_EXIT + RELEASE_TYPE + STAY_DAYS +
  COUNT_CV + MENTAL_HEALTH_SCORE + MEDICAL_SCORE + EDUCATION_SCORE +
  VOCATIONAL_SCORE + WORK_RELEASE + ANGER_MGMT_CLASSES +
  CAREER_TECHNICAL_CLASSES + COGNITIVE_CLASSES +
  INSTITUTIONAL_TRTMNT_CLASSES + LIFE_SKILLS_CLASSES + PARENTING_CLASSES +
  RE_ENTRY_CLASSES + EDUCATION_CLASSES + COLLEGE_CLASSES +
  EMPLOYMENT_PRE + STABLE_PRE

3. Formatting Data for Visualization

library(gridExtra)
library(ggplot2)
library(repr)
# Plot size used by repr when rendering figures (presumably for notebook
# display; units are whatever repr expects for these options).
options(repr.plot.width = 10, repr.plot.height = 10)

# Panel 1 - Sex: dodged bars of `value`, one bar per `name` series, for the
# rows of data2 whose newColName matches the pattern.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "SE".
ptn <- "^SEX*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
# NOTE(review): the tick labels are applied positionally; this assumes the
# matched newColName values appear in Female, Male order - confirm.
plot1 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(name = "Sex", labels = c("Female", "Male")) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Panel 2 - Race: dodged bars of `value`, one bar per `name` series, for the
# rows of data2 whose newColName matches the pattern.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "RAC".
ptn <- "^RACE*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
# `guide` must be an argument of scale_x_discrete(); left outside the call
# (after a stray comma) it is a syntax error. n.dodge = 2 staggers the tick
# labels over two rows so the five race labels do not overlap.
# NOTE(review): the tick labels are applied positionally; this assumes the
# matched newColName values appear in the order listed - confirm.
plot2 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    name = "Race",
    labels = c("Asian", "Black", "Nat. Amer.", "Unknown", "White"),
    guide = guide_axis(n.dodge = 2)
  ) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Panel 3 - Release type: dodged bars of `value`, one bar per `name` series,
# for the rows of data2 whose newColName matches the pattern.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "RELEAS".
ptn <- "^RELEASE*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
# NOTE(review): the tick labels are applied positionally; this assumes the
# matched newColName values appear in the order listed - confirm.
plot3 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    name = "Release Type",
    labels = c("Conditional", "Max", "Parole", "Probation")
  ) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Panel 4 - Offense group: dodged bars of `value`, one bar per `name` series,
# for the rows of data2 whose newColName matches the pattern.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "OFFENS".
ptn <- "^OFFENSE*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
# NOTE(review): the tick labels are applied positionally; this assumes the
# matched newColName values appear in the order listed - confirm.
plot4 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    name = "Offense Group",
    labels = c("Out-of-State", "Drug", "DWI", "Nonviolent", "Sex", "Violent")
  ) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Panel 5 - Age at exit: dodged bars of `value`, one bar per `name` series,
# for the rows of data2 whose newColName matches the pattern. The single tick
# label is blanked out; the axis title carries the description instead.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "AG".
ptn <- "^AGE*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
plot5 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(name = "Average Age At Exit", labels = c(" ")) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Panel 6 - Length of stay: dodged bars of `value`, one bar per `name` series,
# for the rows of data2 whose newColName matches the pattern. The single tick
# label is blanked out; the axis title carries the description instead.
# NOTE(review): the trailing lazy `*?` makes the last character optional, so
# this matches any newColName starting with "STAY_DAY".
ptn <- "^STAY_DAYS*?"
rows <- grep(ptn, data2$newColName, perl = TRUE)
plot6 <- ggplot(data2[rows, ], aes(x = newColName, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    name = "Average Length of Incarceration in Days",
    labels = c(" ")
  ) +
  theme(
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  )

# Arrange the six panels in a two-column grid and keep the arranged object so
# it can be written to disk.
grid_1 <- grid.arrange(
  plot1, plot2, plot3, plot4, plot5, plot6,
  ncol = 2
)

# Name the arguments in full: `file` only worked through partial matching of
# ggsave()'s `filename` argument, and the plot was passed by position.
ggsave(filename = "grid_1.png", plot = grid_1)

Boxplots Comparing Characteristics of Population, “Treatment” Group, and Matched Sample