Code
require(tidyverse)
require(kableExtra)
require(totalcensus)
require(tidycensus)
require(hrbrthemes)
require(janitor)
require(tigris)
require(sf)
require(urbnmapr)
options(scipen = 999)
The statistical software environment R was used to create this analysis. It is an open-source software for statistical computing and graphics.
The following R packages are required to produce the results generated in the blog post:
tidyverse
kableExtra
totalcensus
tidycensus
hrbrthemes
janitor
tigris
sf
urbnmapr
To install packages, use the install.packages("mypackage")
command in the R console, where "mypackage"
is the intended package. Use the following code to load the required packages.
require(tidyverse)
require(kableExtra)
require(totalcensus)
require(tidycensus)
require(hrbrthemes)
require(janitor)
require(tigris)
require(sf)
require(urbnmapr)
options(scipen = 999)
This blog post uses the following data sources:
FHFA Uniform Appraisal Dataset (UAD) Appraisal-Level Public Use File (PUF) Version 1.1
uad_puf_v1_1.rds
FHFA Duty to Serve Rural Areas data
rural2021.txt
2010 Decennial Census - Housing Units
H001001
(Automatically extracted in the code using an API query)2019_Gaz_tracts_national.txt
Simply gather the required data above and place the files in same directory as the code. The following code reads in the data and merges the files together based on census tract.
# load in the UAD Appraisal-Level Public Use File
<- readRDS("uad_puf_v1_1.rds") %>% filter(year %in% (2013:2021))
uad_puf_v1_1
# read in 2021 version of the rural areas file with 2010 tract boundaries
<- read.table(file = "rural2021.txt",
rural_lookup header = TRUE,
colClasses = c(
"character",
"character",
"character",
"character",
"character",
"character",
"character",
"character")) %>%
unite("tract_2010",
:COUNTY:TRACT,
STATEsep = "",
remove = FALSE) %>%
arrange(STATE, COUNTY, TRACT)
# deduplicate St. Louis MSA and select only desired columns
<- rural_lookup %>%
rural_lookup add_count(tract_2010) %>%
#filter(n > 1) %>%
arrange(desc(MSA2018)) %>%
distinct(tract_2010, .keep_all = TRUE) %>%
select(tract_2010, RURAL)
# gather tract level census housing units data
<- tigris::fips_codes %>%
housing_units filter(state_code < 60) %>%
pull(state_code) %>%
unique() %>%
map_dfr( ~ get_decennial(state = .,
"tract",
variables = "H001001",
year = 2010)) %>%
rename(tract_2010 = GEOID,
hous_units = value) %>%
select(tract_2010, hous_units)
# read in 2019 GAZ file
<- read.table(file = "2019_Gaz_tracts_national.txt",
gaz2019 colClasses = c("character",
"character",
"numeric",
"numeric",
"numeric",
"numeric",
"numeric",
"numeric"),
header = TRUE) %>%
select(GEOID, ALAND_SQMI) %>%
rename(tract_2010 = GEOID)
# merge the above sources to the UAD Appraisal-Level Public Use File
<- uad_puf_v1_1 %>%
uad_puf_mrg left_join(rural_lookup, by = ("tract_2010")) %>%
left_join(housing_units, by = ("tract_2010")) %>%
left_join(gaz2019, by = "tract_2010")
The following categories are filtered out of the subset used to create the charts in the blog post. Note that some might be overlapping.
Overall, 1,308,850 records remain, which is about 95% of the total records in the UAD Public Use File.
# recode variables
<- uad_puf_mrg %>%
uad_puf_mrg # compute housing density measurement and categorize appraisals by location
mutate(husqmi = hous_units / ALAND_SQMI,
location = case_when(
== "1" ~ "Rural",
RURAL == "0" & husqmi >= 16 & husqmi <= 1600 ~ "Urban (low-density)",
RURAL == "0" & husqmi > 1600 ~ "Urban (high-density)",
RURAL .default = NA),
# create a label for the number of comps variable
num_comps_label = recode(number_comparables,
"1" = "3",
"2" = "4",
"3" = "5",
"4" = "6",
"5" = "7 +",
"9" = "Missing"),
# create a 5+ comps label
five_plus_comps_label = recode(number_comparables,
"1" = "3 - 4",
"2" = "3 - 4",
"3" = "5 +",
"4" = "5 +",
"5" = "5 +",
"9" = "Missing"),
# create a numeric binary variable for 5+ comps
five_plus_comps = dplyr::recode(number_comparables,
"1" = 0,
"2" = 0,
"3" = 1,
"4" = 1,
"5" = 1))
# if there are NA values in location, replace them with Missing
$location <- uad_puf_mrg$location %>% replace_na("Missing")
uad_puf_mrg
# create the subset for charting and analysis
<- uad_puf_mrg %>%
blog_subset # non missing num_comp categories
filter(number_comparables %in% c('1', '2', '3', '4', '5')) %>%
# only want purchase and refi
filter(purpose == '1' | purpose == '2') %>%
# filter out missing values for location
filter(location != "Missing") %>%
# filter out Puerto Rico
filter(state_fips != '72')
# cast location as a factor and assign levels
$location <- factor(blog_subset$location,
blog_subsetlevels = c("Urban (high-density)",
"Urban (low-density)",
"Rural"))
# cast state_fips as a factor variable and convert to 2 letter state abbreviation
$state <- blog_subset$state_fips %>%
blog_subsetconvert_fips_to_names()
Figure 1 represents the distribution of the comparable properties used in home appraisals acquired by the Enterprises.
# chart the overall distribution
%>%
blog_subset count(num_comps_label) %>%
mutate(total_num_comps_label = sum(n),
perc = n / total_num_comps_label) %>%
ggplot(aes(x = num_comps_label, y = perc)) +
geom_col(fill = "#0570b0") +
scale_y_continuous(
labels = scales::percent,
breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3),
limits = c(0, 0.3)) +
labs(
title = "Figure 1: Percentage of Appraisals by Number of Comps (2013 – 2021)",
caption = paste(
paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
str_wrap('Note: Data have been restricted to only purchase and refinance appraisals that are not missing the “number of comps” field in the UAD PUF and can be assigned a location category. Additionally, appraisals in Puerto Rico are excluded. In total, 4.7 percent of the UAD PUF records are excluded.',125), sep = '\n'),
x = "Number of Comps",
y = "") +
theme_ipsum(
axis_title_just = "cc",
plot_title_size = 20,
axis_title_size = 16,
caption_size = 11) +
theme(
plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(margin = margin(b = 16)),
axis.title.y = element_text(margin = margin(r = 16)),
axis.title.x = element_text(margin = margin(t = 16)),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.text.y = element_text(size = 12, vjust = 0.5),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank())
Figure 2 represents the trend in the percentage of appraisals with five or more comparable properties over time.
# create a subset of the data for use in the time series chart
<- blog_subset %>%
time_series_data select(year, five_plus_comps)
# convert year to date
$year <- lubridate::ymd(time_series_data$year, truncated = 2L)
time_series_data
# compute the percent of 5+ comps by year, chart that percent from 2013 - 2021
%>%
time_series_data count(year, five_plus_comps) %>%
group_by(year) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
transmute(year = year(year), perc) %>%
ggplot(aes(x=year, y=perc)) +
geom_line(color="#0570b0", size = 1) +
geom_point(color ="#0570b0", size=2.5) +
scale_x_continuous(breaks = 2013:2021) +
scale_y_continuous(labels = scales::percent,
limits = c(0, 0.9),
breaks = seq(0, 0.9, 0.1),
expand = expansion(mult = 0)) +
labs(
title = "Figure 2: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)',
x = "Year",
y = "") +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 30), hjust = 0.5),
plot.subtitle = element_text(margin = margin(b = 16)),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.text = ggplot2::element_text(),
axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
margin = ggplot2::margin(t = 4L)),
axis.text.y = ggplot2::element_text(size = 12, hjust = 1),
axis.text.x.top = NULL,
axis.text.y.right = NULL,
axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL,
axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL,
axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
axis.title = ggplot2::element_text(face = "italic"),
axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
axis.title.y = ggplot2::element_text(angle = 90L,
margin = ggplot2::margin(r = 16)),
axis.title.x.top = NULL,
axis.title.y.right = NULL,
axis.ticks = ggplot2::element_line(),
axis.ticks.length = ggplot2::unit(4L, "pt"),
axis.ticks.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.ticks.y = ggplot2::element_blank(),
axis.line = ggplot2::element_line(),
axis.line.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.line.y = ggplot2::element_blank())
Figure 3a shows each state’s percentage of appraisals with five or more comparable properties in descending rank order. Figure 3b displays a corresponding map by state.
# chart the percent of appraisals with 5+ comps by state
%>%
blog_subset count(state, five_plus_comps) %>%
group_by(state) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
arrange(perc) %>%
mutate(state = factor(state, levels = .$state)) %>%
ggplot(aes(perc, state)) +
geom_segment(aes(x = 0, xend = perc, y = state, yend = state)) +
geom_point(color="#0570b0", size=2.5) +
scale_x_continuous(labels = scales::percent, expand = expansion(mult = c(0, 0)), limits = c(0, 1)) +
labs(
title = "Figure 3a: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
x = " ",
y = " ",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.text = ggplot2::element_text(),
axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
margin = ggplot2::margin(t = 4L)),
axis.text.y = ggplot2::element_text(size = 10.5, hjust = 1),
axis.text.x.top = NULL,
axis.text.y.right = NULL,
axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL,
axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL,
axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
axis.title = ggplot2::element_text(face = "italic"),
axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
axis.title.y = ggplot2::element_text(angle = 90L,
margin = ggplot2::margin(r = 16)),
axis.title.x.top = NULL,
axis.title.y.right = NULL,
axis.ticks = ggplot2::element_line(),
axis.ticks.length = ggplot2::unit(4L, "pt"),
axis.ticks.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.ticks.y = ggplot2::element_blank(),
axis.line = ggplot2::element_line(),
axis.line.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.line.y = ggplot2::element_blank())
# get state geometry
<- tigris::states(progress_bar = FALSE) %>% select(STUSPS, geometry)
states <- shift_geometry(st_transform(states, "ESRI:102003"))
state_geometry
# compute percent of appraisals with 5+ comps by state
<- blog_subset %>%
state_data count(state, five_plus_comps) %>%
group_by(state) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
arrange(perc) %>%
mutate(state = factor(state, levels = .$state))
# join the percentage on the state geometry
<- inner_join(state_geometry, state_data, join_by(STUSPS == state))
mapdata
# create a map of the percent of appraisals with 5+ comps by state
%>%
mapdata ggplot() +
geom_sf(mapping = aes(fill = perc),
color = "white") +
scale_fill_gradient(
low = "#F3F3F3",
high = "#4080bf",
breaks = c(0, 0.5, 1),
labels = c("0%", "50%", "100%"),
limits = c(0, 1)) +
labs(
fill = NULL,
title = "Figure 3b: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
geom_sf_text(data = get_urbn_labels(map = "states", sf = TRUE),
aes(label = state_abbv),
size = 3) +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.margin = unit(c(1, 2, 1, 1), "cm"),
legend.position = c(0.97, 0.45),
plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
legend.text=element_text(size=11),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())
Figure 4 below shows how the number of comparable properties varies by high-density urban areas, low-density urban areas, and rural areas.
# create a table of the count of appraisals per location category
%>%
blog_subset count(location) %>%
mutate(total = sum(n, na.rm = TRUE),
Percent = n / total) %>%
mutate(across(n, scales::label_comma())) %>%
mutate(across(Percent, scales::label_percent())) %>%
rename(Appraisals=n) %>%
select(!total) %>%
kbl() %>%
kable_styling(bootstrap_options = c("bordered", "hover", full_width = F))
location | Appraisals | Percent |
---|---|---|
Urban (high-density) | 330,582 | 25.3% |
Urban (low-density) | 752,686 | 57.5% |
Rural | 225,582 | 17.2% |
# chart the percent of appraisals with 5+ comps by location category
%>%
blog_subset ggplot(aes(x = five_plus_comps_label,
y = after_stat(prop),
fill = location,
group = location)) +
geom_bar(position = position_dodge()) +
scale_fill_manual(values = c("#74a9cf", "#2b8cbe", "#045a8d")) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 0.8, 0.1), limits = c(0,0.8)) +
labs(
title = "Figure 4: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
caption = paste(
paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
str_wrap('Note: Rural tracts assigned based on FHFA Duty to Serve definitions. For non-rural tracts, low-density is defined as having 16-1,600 housing units per square mile. High-density urban areas are non-rural tracts with greater than 1,600 housing units per square mile. Rural: 225,582 appraisals (17.2%); Urban (low-density): 752,686 appraisals (57.5%); Urban (high-density): 330,582 appraisals (25.3%).',125), sep = '\n'),
x = "Number of Comps",
y = "",
fill = "") +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
axis.title.y = element_text(margin = margin(r = 16)),
axis.title.x = element_text(margin = margin(t = 16)),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.text.y = element_text(size = 12, vjust = 0.5),
legend.text=element_text(size=11),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank())