README: UAD PUF Number of Comps Blog

Author

Sean Cannon and Will Fischler

Overview

The statistical software environment R was used to create this analysis. It is an open-source software for statistical computing and graphics.

The following R packages are required to produce the results generated in the blog post:

tidyverse
kableExtra
totalcensus
tidycensus
hrbrthemes
janitor
tigris
sf
urbnmapr

To install packages, use the install.packages("mypackage") command in the R console, where "mypackage" is the intended package. Use the following code to load the required packages.

Code

require(tidyverse)
require(kableExtra)
require(totalcensus)
require(tidycensus)
require(hrbrthemes)
require(janitor)
require(tigris)
require(sf)
require(urbnmapr)

options(scipen = 999)

Data

This blog post uses the following data sources:

FHFA Uniform Appraisal Dataset (UAD) Appraisal-Level Public Use File (PUF) Version 1.1
- uad_puf_v1_1.rds
FHFA Duty to Serve Rural Areas data
- rural2021.txt
2010 Decennial Census - Housing Units
- H001001 (Automatically extracted in the code using an API query)
2019 Census Gazetteer Files
- 2019_Gaz_tracts_national.txt

Simply gather the required data above and place the files in same directory as the code. The following code reads in the data and merges the files together based on census tract.

Code

# load in the UAD Appraisal-Level Public Use File
uad_puf_v1_1 <- readRDS("uad_puf_v1_1.rds") %>% filter(year %in% (2013:2021))

# read in 2021 version of the rural areas file with 2010 tract boundaries
rural_lookup <- read.table(file = "rural2021.txt",
                           header = TRUE,
                           colClasses = c(
                             "character",
                             "character",
                             "character",
                             "character",
                             "character",
                             "character",
                             "character",
                             "character")) %>%
  unite("tract_2010",
        STATE:COUNTY:TRACT,
        sep = "",
        remove = FALSE) %>%
  arrange(STATE, COUNTY, TRACT)

# deduplicate St. Louis MSA and select only desired columns
rural_lookup <- rural_lookup %>%
  add_count(tract_2010) %>%
  #filter(n > 1) %>%
  arrange(desc(MSA2018)) %>%
  distinct(tract_2010, .keep_all = TRUE) %>%
  select(tract_2010, RURAL)

# gather tract level census housing units data
housing_units <- tigris::fips_codes %>%
  filter(state_code < 60) %>%
  pull(state_code) %>%
  unique() %>%
  map_dfr( ~ get_decennial(state = .,
                           "tract",
                           variables = "H001001",
                           year = 2010)) %>%
  rename(tract_2010 = GEOID,
         hous_units = value) %>%
  select(tract_2010, hous_units)

# read in 2019 GAZ file
gaz2019 <- read.table(file = "2019_Gaz_tracts_national.txt",
                      colClasses = c("character",
                                     "character",
                                     "numeric",
                                     "numeric",
                                     "numeric",
                                     "numeric",
                                     "numeric",
                                     "numeric"),
                      header = TRUE) %>%
  select(GEOID, ALAND_SQMI) %>%
  rename(tract_2010 = GEOID)

# merge the above sources to the UAD Appraisal-Level Public Use File
uad_puf_mrg <- uad_puf_v1_1 %>%
  left_join(rural_lookup, by = ("tract_2010")) %>%
  left_join(housing_units, by = ("tract_2010")) %>%
  left_join(gaz2019, by = "tract_2010")

Subset for Visualization

The following categories are filtered out of the subset used to create the charts in the blog post. Note that some might be overlapping.

Appraisals missing the number of comparable properties variable
Appraisals not categorized as a home purchase or refinance transaction
Appraisals that cannot be assigned a location category
Appraisals in Puerto Rico

Overall, 1,308,850 records remain, which is about 95% of the total records in the UAD Public Use File.

Code

# recode variables
uad_puf_mrg <- uad_puf_mrg %>%
  # compute housing density measurement and categorize appraisals by location
  mutate(husqmi = hous_units / ALAND_SQMI,
         location = case_when(
           RURAL == "1" ~ "Rural",
           RURAL == "0" & husqmi >= 16 & husqmi <= 1600 ~ "Urban (low-density)",
           RURAL == "0" & husqmi > 1600 ~ "Urban (high-density)",
           .default = NA),
         # create a label for the number of comps variable
         num_comps_label = recode(number_comparables,    
                                  "1" = "3",
                                  "2" = "4",
                                  "3" = "5",
                                  "4" = "6",
                                  "5" = "7 +",
                                  "9" = "Missing"),
         # create a 5+ comps label
         five_plus_comps_label = recode(number_comparables,
                                        "1" = "3 - 4",
                                        "2" = "3 - 4",
                                        "3" = "5 +",
                                        "4" = "5 +",
                                        "5" = "5 +",
                                        "9" = "Missing"),
         # create a numeric binary variable for 5+ comps
         five_plus_comps = dplyr::recode(number_comparables,
                                         "1" = 0,
                                         "2" = 0,
                                         "3" = 1,
                                         "4" = 1,
                                         "5" = 1))

# if there are NA values in location, replace them with Missing
uad_puf_mrg$location <- uad_puf_mrg$location %>% replace_na("Missing")

# create the subset for charting and analysis
blog_subset <- uad_puf_mrg %>%
  # non missing num_comp categories
  filter(number_comparables %in%  c('1', '2', '3', '4', '5')) %>%
  # only want purchase and refi
  filter(purpose == '1' | purpose == '2') %>%
  # filter out missing values for location
  filter(location != "Missing") %>%
  # filter out Puerto Rico
  filter(state_fips != '72')                                       

# cast location as a factor and assign levels
blog_subset$location <- factor(blog_subset$location,
                               levels = c("Urban (high-density)",
                                          "Urban (low-density)",
                                          "Rural"))

# cast state_fips as a factor variable and convert to 2 letter state abbreviation
blog_subset$state <- blog_subset$state_fips %>%
  convert_fips_to_names()

Overall Distribution

Figure 1 represents the distribution of the comparable properties used in home appraisals acquired by the Enterprises.

Code

# chart the overall distribution
blog_subset %>%
  count(num_comps_label) %>%
  mutate(total_num_comps_label = sum(n),
         perc = n / total_num_comps_label) %>%
  ggplot(aes(x = num_comps_label, y = perc)) +
  geom_col(fill = "#0570b0") +
  scale_y_continuous(
    labels = scales::percent,
    breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3),
    limits = c(0, 0.3)) +
  labs(
    title = "Figure 1: Percentage of Appraisals by Number of Comps (2013 – 2021)",
    caption = paste(
      paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
      str_wrap('Note: Data have been restricted to only purchase and refinance appraisals that are not missing the “number of comps” field in the UAD PUF and can be assigned a location category. Additionally, appraisals in Puerto Rico are excluded. In total, 4.7 percent of the UAD PUF records are excluded.',125), sep = '\n'),
    x = "Number of Comps",
    y = "") +
  theme_ipsum(
    axis_title_just = "cc",
    plot_title_size = 20,
    axis_title_size = 16,
    caption_size = 11) +
  theme(
    plot.title = element_text(margin = margin(b = 10)),
    plot.subtitle = element_text(margin = margin(b = 16)),
    axis.title.y = element_text(margin = margin(r = 16)),
    axis.title.x = element_text(margin = margin(t = 16)),
    axis.text.x = element_text(size = 12, vjust = 0.5),
    axis.text.y = element_text(size = 12, vjust = 0.5),
    plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank())

Time Series

Figure 2 represents the trend in the percentage of appraisals with five or more comparable properties over time.

Code

# create a subset of the data for use in the time series chart
time_series_data <- blog_subset %>%
  select(year, five_plus_comps)

# convert year to date
time_series_data$year <- lubridate::ymd(time_series_data$year, truncated = 2L)

# compute the percent of 5+ comps by year, chart that percent from 2013 - 2021
time_series_data %>% 
  count(year, five_plus_comps) %>% 
  group_by(year) %>% 
  mutate(total = sum(n, na.rm = TRUE)) %>% 
  ungroup() %>% 
  mutate(perc = n / total) %>% 
  filter(five_plus_comps == 1) %>% 
  transmute(year = year(year), perc) %>%
  ggplot(aes(x=year, y=perc)) +
  geom_line(color="#0570b0", size = 1) +
  geom_point(color ="#0570b0", size=2.5) +
  scale_x_continuous(breaks = 2013:2021) + 
  scale_y_continuous(labels = scales::percent, 
                     limits = c(0, 0.9), 
                     breaks = seq(0, 0.9, 0.1), 
                     expand = expansion(mult = 0)) +
    labs(
    title = "Figure 2: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
    caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)',
    x = "Year",
    y = "") +
  theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16,  caption_size = 11) +
  theme(plot.title = element_text(margin = margin(b = 30), hjust = 0.5),
        plot.subtitle = element_text(margin = margin(b = 16)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
        axis.text = ggplot2::element_text(),
        axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
                                            margin = ggplot2::margin(t = 4L)),
        axis.text.y = ggplot2::element_text(size = 12, hjust = 1),
        axis.text.x.top = NULL,
        axis.text.y.right = NULL,
        axis.ticks.length.x = NULL,
        axis.ticks.length.x.top = NULL,
        axis.ticks.length.x.bottom = NULL,
        axis.ticks.length.y = NULL,
        axis.ticks.length.y.left = NULL,
        axis.ticks.length.y.right = NULL,
        axis.title = ggplot2::element_text(face = "italic"),
        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
        axis.title.y = ggplot2::element_text(angle = 90L,
                                             margin = ggplot2::margin(r = 16)),
        axis.title.x.top = NULL,
        axis.title.y.right = NULL,
        axis.ticks = ggplot2::element_line(),
        axis.ticks.length = ggplot2::unit(4L, "pt"),
        axis.ticks.x = ggplot2::element_line(colour = NULL,
                                             size = NULL,
                                             linetype = NULL,
                                             lineend = NULL),
        axis.ticks.y = ggplot2::element_blank(),
        axis.line = ggplot2::element_line(),
        axis.line.x = ggplot2::element_line(colour = NULL,
                                            size = NULL,
                                            linetype = NULL,
                                            lineend = NULL),
        axis.line.y = ggplot2::element_blank())

State

Figure 3a shows each state’s percentage of appraisals with five or more comparable properties in descending rank order. Figure 3b displays a corresponding map by state.

Code

# chart the percent of appraisals with 5+ comps by state
blog_subset %>%
  count(state, five_plus_comps) %>% 
  group_by(state) %>% 
  mutate(total = sum(n, na.rm = TRUE)) %>% 
  ungroup() %>% 
  mutate(perc = n / total) %>% 
  filter(five_plus_comps == 1) %>%
  arrange(perc) %>%
  mutate(state = factor(state, levels = .$state)) %>%
  ggplot(aes(perc, state)) +
  geom_segment(aes(x = 0, xend = perc, y = state, yend = state)) + 
  geom_point(color="#0570b0", size=2.5) + 
  scale_x_continuous(labels = scales::percent, expand = expansion(mult = c(0, 0)), limits = c(0, 1)) +
  labs(
    title = "Figure 3a: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
    x = " ", 
    y = " ",
    caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
  theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16,  caption_size = 11) + 
  theme(plot.title = element_text(margin = margin(b = 10)),
        plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
        axis.text = ggplot2::element_text(),
        axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
                                            margin = ggplot2::margin(t = 4L)),
        axis.text.y = ggplot2::element_text(size = 10.5, hjust = 1),
        axis.text.x.top = NULL,
        axis.text.y.right = NULL,
        axis.ticks.length.x = NULL,
        axis.ticks.length.x.top = NULL,
        axis.ticks.length.x.bottom = NULL,
        axis.ticks.length.y = NULL,
        axis.ticks.length.y.left = NULL,
        axis.ticks.length.y.right = NULL,
        axis.title = ggplot2::element_text(face = "italic"),
        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
        axis.title.y = ggplot2::element_text(angle = 90L,
                                             margin = ggplot2::margin(r = 16)),
        axis.title.x.top = NULL,
        axis.title.y.right = NULL,
        axis.ticks = ggplot2::element_line(),
        axis.ticks.length = ggplot2::unit(4L, "pt"),
        axis.ticks.x = ggplot2::element_line(colour = NULL,
                                             size = NULL,
                                             linetype = NULL,
                                             lineend = NULL),
        axis.ticks.y = ggplot2::element_blank(),
        
        axis.line = ggplot2::element_line(),
        axis.line.x = ggplot2::element_line(colour = NULL,
                                            size = NULL,
                                            linetype = NULL,
                                            lineend = NULL),
        axis.line.y = ggplot2::element_blank())

Code

# get state geometry
states <- tigris::states(progress_bar = FALSE) %>% select(STUSPS, geometry)
state_geometry <- shift_geometry(st_transform(states, "ESRI:102003"))

# compute percent of appraisals with 5+ comps by state
state_data <- blog_subset %>%
  count(state, five_plus_comps) %>%
  group_by(state) %>%
  mutate(total = sum(n, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(perc = n / total) %>%
  filter(five_plus_comps == 1) %>%
  arrange(perc) %>%
  mutate(state = factor(state, levels = .$state))

# join the percentage on the state geometry
mapdata <- inner_join(state_geometry, state_data, join_by(STUSPS == state))

# create a map of the percent of appraisals with 5+ comps by state
mapdata %>%
  ggplot() +
  geom_sf(mapping = aes(fill = perc),
          color = "white") +
  scale_fill_gradient(
    low = "#F3F3F3",
    high = "#4080bf",
    breaks = c(0, 0.5, 1),
    labels = c("0%", "50%", "100%"),
    limits = c(0, 1)) +
  labs(
    fill = NULL,
    title = "Figure 3b: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
    caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
  geom_sf_text(data = get_urbn_labels(map = "states", sf = TRUE),
               aes(label = state_abbv),
               size = 3) +
  theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16,  caption_size = 11) +
  theme(plot.margin = unit(c(1, 2, 1, 1), "cm"),
        legend.position = c(0.97, 0.45),
        plot.title = element_text(margin = margin(b = 10)),
        plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
        legend.text=element_text(size=11),
        plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank())

Location

Figure 4 below shows how the number of comparable properties varies by high-density urban areas, low-density urban areas, and rural areas.

Code

# create a table of the count of appraisals per location category
blog_subset %>%
  count(location) %>%
  mutate(total = sum(n, na.rm = TRUE),
         Percent = n / total) %>%
  mutate(across(n, scales::label_comma())) %>%
  mutate(across(Percent, scales::label_percent())) %>%
  rename(Appraisals=n) %>%
  select(!total) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("bordered", "hover", full_width = F))

location	Appraisals	Percent
Urban (high-density)	330,582	25.3%
Urban (low-density)	752,686	57.5%
Rural	225,582	17.2%

Code

# chart the percent of appraisals with 5+ comps by location category
blog_subset %>% 
  ggplot(aes(x = five_plus_comps_label, 
             y = after_stat(prop), 
             fill = location,
             group = location)) + 
  geom_bar(position = position_dodge()) +
  scale_fill_manual(values = c("#74a9cf", "#2b8cbe", "#045a8d")) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 0.8, 0.1), limits = c(0,0.8)) +
  labs(
    title = "Figure 4: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
    caption = paste(
      paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
      str_wrap('Note: Rural tracts assigned based on FHFA Duty to Serve definitions. For non-rural tracts, low-density is defined as having 16-1,600 housing units per square mile. High-density urban areas are non-rural tracts with greater than 1,600 housing units per square mile. Rural: 225,582 appraisals (17.2%); Urban (low-density): 752,686 appraisals (57.5%); Urban (high-density): 330,582 appraisals (25.3%).',125), sep = '\n'),
    x = "Number of Comps",
    y = "",
    fill = "") +
  theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16,  caption_size = 11) +
  theme(plot.title = element_text(margin = margin(b = 10)),
        plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
        axis.title.y = element_text(margin = margin(r = 16)),
        axis.title.x = element_text(margin = margin(t = 16)),
        axis.text.x = element_text(size = 12, vjust = 0.5),
        axis.text.y = element_text(size = 12, vjust = 0.5),
        legend.text=element_text(size=11),
        plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_blank())