Descriptives

Schools

Show the Code

# Set the path of the R-studio project
file = gsub("website", "", getwd())
# Load the economic covariates at the Nta level
econ = read_rds(paste0(file, "/02_GenData/03_dem/econ.rds"))
dem = read_rds(paste0(file, "/02_GenData/03_dem/dem.rds"))
asthma = read_rds(paste0(file, "/02_GenData/03_dem/AsthmaHosp.rds"))
# Load the location of all schools
location = read_rds(paste0(file, "/02_GenData/01_schools/location.rds"))
# Load the data set on absenteeism
absent = read_rds(paste0(file, "/02_GenData/RawAbsences.rds")) |> ungroup()

NtaAbsent = left_join(absent |> dplyr::filter(DayType == "NormalDay"), location) |>  group_by(nta, date) |> 
  summarise_at(vars(enrolled, absent, released), sum, na.rm = T)

# Load the shapefile of city boroughs
BoroShp = st_read(paste0(file, "/01_RawData/04_shp/02_bgh"), quiet = T)%>% st_transform(., crs = 4326) %>% st_simplify(., preserveTopology = T)
# Load the shapefile on city neighborhoods
NtaShp = st_read(paste0(file, "/01_RawData/04_shp/03_ngh"), quiet = T)%>% st_transform(., crs = 4326) %>% st_simplify(., preserveTopology = T)

Figure 1 in the first tab shows the location of all schools by type. Schools spread around the city, with significant clusters in South Bronx, North Manhattan, and Brooklyn. Staten Island and Queen’s distance between schools is larger than for the Bronx or Manhattan. The data set contains 856 elementary schools, 495 high schools, 379 junior high schools, 267 kindergarten to eighth-grade schools (K8), 145 secondaries, 109 kindergarten to twelfth-grade schools (K12), 2 early childhood, and two ungraded. Figure 2 portrays the median neighborhood income in 2016 (according to the American Community Survey). The city’s wealthier areas are in Manhattan, Staten Island, and West Brooklyn, with more impoverished regions in South Bronx and East Brooklyn. Figure 3 portrays the share of white persons in each neighborhood. The strong correlation between income and the share of whites is evident, with higher percentages of white persons in Manhattan and Staten Island and lower shares in the Bronx, Queens, and Brooklyn. Figure 4 contains the share of children’s hospitalization per 10,000 across all city areas. Again, there is a strong relationship between asthma hospitalizations and income, with higher hospitalization rates in the Bronx and North-Central Brooklyn and lower in Staten Island and Manhattan.

Show the Code

# Create the plot data
# Simplify the shapefile
BoroShp  = st_transform(BoroShp, crs = 4326) %>% 
  st_simplify(., preserveTopology = T)
# Transform the locations data frame into a spatial object
location = st_as_sf(location, coords = c("lon", "lat"), crs = 4326)
# Plot the map of schools in New York
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(BoroShp) + tm_polygons()  + tm_shape(location) +
  tm_dots(size = 0.025, col = "SchoolType", palette = "Dark2", title = "Type of School") + 
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Show the Code

# Create the plot data
# Simplify the shapefile
plot  = left_join(NtaShp, econ |>  mutate(MedianIncome = MedianIncome/1000))
# Plot the map of schools in New York
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(plot) +
  tm_fill(col = "MedianIncome", style = "quantile", n = 10, 
          title = "Median Income (Ths.)",  
          palette = "RdBu", colorNA = "black", textNA = "No Data")  +
  tm_borders(lwd = 0.5) +
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Figure 2: Average Income across NYC neighborhoods

Show the Code

# Create the plot data
# Simplify the shapefile
plot  = left_join(NtaShp, dem)
# Plot the map of schools in New York
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(plot) +
  tm_fill(col = "ShareWhite", style = "quantile", n = 10,
          title = str_wrap("Share of Persons Identifying as White", width = 25),
          palette = "RdBu", colorNA = "black", textNA = "No Data")  +
  tm_borders(lwd = 0.5) +
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Figure 3: Share of white persons per neighborhood

Show the Code

# Create the plot data
# Simplify the shapefile
plot  = left_join(NtaShp, asthma)
# Plot the map of schools in New York
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(plot) +
  tm_fill(col = "HospChildrenRate", style = "quantile", n = 10,  
          title = str_wrap("Children Hospitalization Rate (Cases per 10,000 Kids)", width = 30),
          palette = "RdBu", colorNA = "black", textNA = "No Data")  +
  tm_borders(lwd = 0.5) +
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Figure 4: Children hospitalization rate per neighborhood

Table 1 contains descriptive statistics on absent, released, and enrolled students. There are 3,339,158 observations across 1,854 schools. On average, there are 56 daily absences per school in the city of New York. This is equivalent to around 9.04% of enrolled students. The absences have a median of 39 and a standard deviation of 75. Concerning released students, 2 (0.40%) students are released from school on an average day. Additionally, and in line with an over-dispersed Poisson distribution, releases have a median of 0 and a standard deviation of 27 pupils per day.

Show the Code

# Show the data set
sumtable(select(ungroup(absent), absent, released, enrolled), 
         digits = 2, fixed.digits = T, 
         summ = c("mean(x)", "median(x)", "sd(x)", "min(x)", 
                  "max(x)", "notNA(x)","countNA(x)"), 
         summ.names = c("Mean", "Median", "SD", "Min", "Max", "N", "NAs"), 
         labels = c("Absent Students", "Released Students", "Enrolled Students"))

Table 1: Descriptive Statistics on School Absences
Variable	Mean	Median	SD	Min	Max	N	NAs
Absent Students	56.34	39	74.74	0	3511	3339158	0
Released Students	2.46	0	27.29	0	3777	3338834	324
Enrolled Students	622.96	497	511.04	1	5850	3339158	0

Figure 5 shows the number of schools per year reporting absences. Between 2008 and 2018, the number of reporting schools moved from a minimum of 1,545 in 2008 to a maximum of 1,659 in 2016. There are 1,365 schools in the data set that reported absenteeism for all years between 2008 and 2018. Although I use the entire sample in the regression analysis, I provide evidence that our results are robust to only focusing on the set of schools that never drop the panel.

Show the Code

# Number of schools reporting absences per year
absent |>  filter(DayType == "NormalDay") |>  
  group_by(year) |> summarise(Schools = length(unique(id))) |>  
  filter(year > 2007) |> mutate(year = as_factor(year)) %>% ggplot() + 
   geom_line(aes(x = year, y= Schools, group = 1), size = 0.75) +
  geom_point(aes(x = year, y= Schools), color = "red") +
  labs(y = "", x = "Year") +   theme_economist() %+replace%
  theme(legend.title = element_blank(), plot.background = element_blank()) +
  grids("y")

Figure 6 portrays the density function of school absences. As expected, the distribution of absences resembles a gamma probability density function with most of the density between zero and one hundred units.

Show the Code

#  Create the plot
plot = absent |> ungroup() |>  filter(DayType == "NormalDay") |> filter(absent < quantile(absent, 0.99, na.rm = T)) |> group_by(date) |> 
  summarise(absent = mean(absent, na.rm = T))

# Construct the ggplot
ggplot(plot) + 
  ggdist::stat_halfeye(aes(x = absent), height = .6,  justification = -.3,
                       .width = 0, point_colour = NA, fill = "#141f52") +
  geom_point(aes(x = absent, y = 0), size = 0.2, alpha = .2, position = position_jitter(seed = 1, height = 0.1), color = "red")+ 
   geom_boxplot(aes(x = absent), width = .25, 
                fill = "transparent", outlier.shape = NA) + 
  labs(y = "", x = "Absent Students") + theme_economist() %+replace%
  theme(legend.title = element_blank(), plot.background = element_blank()) +
  grids("y")

Figure 6: Density distribution of School Absences

Figure 7 plots the time series of school absences. Each bar represents a day between 2006 and 2018. The breaks in the time series refer to the summer and winter breaks. Overall, it is reassuring to see no extreme changes in average school absences across our sample period. Note that drastic jumps in the number of absent students often coincide with weather events. For instance, the significant increase in 2014 refers to the 2014 nor’easter and in 2012 to intense snowfall.

Show the Code

# Create the data for the plot
plot = absent |>  filter(DayType == "NormalDay") |> filter(absent < quantile(absent, 0.99, na.rm = T)) |> group_by(date, year) |>  summarise(absent = mean(absent)) |> ungroup() |> 
  mutate(RollMean = rollmean(absent, k = 5, fill = NA))

# Plot it
ggplot(plot) + 
  geom_bar(aes(x = date, y = absent, fill = as_factor(year)), size = 0.05, stat = "identity") + 
  labs(y = "Mean Absences", x = "Date") + guides(fill = "none") + theme_economist() %+replace%
  theme(legend.title = element_blank(), plot.background = element_blank()) +
  grids("y")

Figure 7: Time Series of Daily School Absences

In Figure 8, we portray the average value of school absences across months of the year. The first thing to notice is a significant increase in absences for the month before the summer holidays. Additionally, absences appear to be higher during the winter months of December to February because of weather conditions and seasonal influenza. Regarding variation within weeks (?@fig-WeeklyTS). There are more increased absences on Fridays and Mondays and lower throughout the rest of the week.

Show the Code

# Construct the data for the plot 
plot = list(`Monthly Variation` = absent |>  dplyr::filter(DayType == "NormalDay") |> 
           filter(absent < quantile(absent, 0.99, na.rm = T)) |> 
           mutate(month = month(date, label = T)) |> 
           group_by(ts = month) |>  summarise(absent = mean(absent, na.rm = T)) ,
           
           `Weekday Variation` = absent |>  dplyr::filter(DayType == "NormalDay") |> 
           filter(absent < quantile(absent, 0.99, na.rm = T)) |> 
           mutate(weekday = substr(weekday, 1,2)) |> 
           group_by(ts = weekday) |>  summarise(absent = mean(absent, na.rm = T))) %>% 
  
  rbindlist(., idcol = "window")
           
           
        
# Plot the time series
ggplot(plot) + geom_bar(aes(x = ts, y = absent, group = 1), stat = "identity", fill = "#141f52") + 
  geom_point(aes(x = ts, y = absent, group = 1), color = "red") +
  labs(y = "Mean Absences", x = "") + guides(fill = "none") + theme_economist() %+replace%
  theme(legend.title = element_blank(), plot.background = element_blank()) +
  grids("y") + facet_wrap(~window, scales = "free")

Figure 8: Monthly Variation in School Absences

Throughout the study, we aggregate the school-level data into neighborhood averages for three main reasons. First, aggregating data reduces the noise from school-specific miss-recordings and human error. Second, it is easier to match socio-demographic data from the American Community Survey. And third, it decreases the computational burden of our empirical strategy.

Table 2 contains descriptive statistics on school absences at the neighborhood level. The NTA data contains 379,120 observations across 0 neighbourhoods. On average, there are 491 absences per day, equivalent to around 8.90% of enrolled students. The absences have a median of 354 and a standard deviation of 573. Concerning released students, 9 (0.16%) students are released from school on an average day.

Show the Code

# Show the data set
tab = left_join(absent |> filter(DayType == "NormalDay"), location) |>  group_by(nta, date) |> 
  summarise_at(vars(enrolled, absent, released), sum, na.rm = T)


sumtable(select(ungroup(tab) ,absent, released, enrolled), 
         digits = 2, fixed.digits = T, 
         summ = c("mean(x)", "median(x)", "sd(x)", "min(x)", 
                  "max(x)", "notNA(x)","countNA(x)"), 
         summ.names = c("Mean", "Median", "SD", "Min", "Max", "N", "NAs"), 
         labels = c("Absent Students", "Released Students", "Enrolled Students"))

Table 2: Descriptive Statistics on School Absences at the neighborhood level
Variable	Mean	Median	SD	Min	Max	N
Absent Students	491.29	354	572.72	0	22819	379120
Released Students	8.84	0	75.01	0	3801	379120
Enrolled Students	5521.89	4746	3697.11	229	67963	379120

The next set of figures contains different descriptive statistics of school absences at the neighborhood level. First, Figure 9 has the density function of the log of absent daily students per neighborhood. Next, the map in Figure 10 includes the average number of students enrolled per NTA. Figure 11 refers to the average share of annual absences, i.e., absences/enrollment.

Show the Code

# Construct the data for the plot 
AbsentNta = left_join(absent |>  
                        filter(DayType == "NormalDay") |> 
                        filter(absent < quantile(absent, 0.99, na.rm = T)), location) |>  
  group_by(nta, ntacode,date) |> 
  summarise_at(vars(enrolled, absent, released), sum, na.rm = T) |> 
  filter(absent > 0)

# Plot the time series
ggplot(AbsentNta) + 
  stat_halfeye(aes(x = log(absent)), height = .6,  justification = -.3, .width = 0, 
               point_colour = NA, fill = "#141f52") +
  geom_point(aes(x = log(absent), y = 0), size = 0.2, alpha = .2, 
             position = position_jitter(seed = 1, height = 0.1), color = "red")+ 
   geom_boxplot(aes(x = log(absent)), width = .25, fill = "transparent", outlier.shape = NA) + 
  labs(y = "", x = "Log of Absent Students")  + theme_economist() %+replace%
  theme(legend.title = element_blank(), plot.background = element_blank()) +
  grids("y")

Figure 9: Density plot of school absences at the NTA level

Show the Code

# Construct the data for the plot 
map = AbsentNta |> group_by(ntacode) |> 
  summarise(`Avg. Enrolled Students` = mean(enrolled, na.rm = T)) %>% 
  left_join(NtaShp, .)

# Plot the map series
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(map) +
  tm_fill(col = "Avg. Enrolled Students", style = "quantile", n = 10,  
          palette = "RdBu", colorNA = "white", textNA = "No Data")  +
  tm_borders(lwd = 0.5) +
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Show the Code

# Construct the data for the plot 
map = AbsentNta |> group_by(ntacode) |> 
  summarise(`Share of Absences` = mean(absent, na.rm = T)/mean(enrolled, na.rm = T)) %>% left_join(NtaShp, .) |>  
  mutate(`Share of Absences` = `Share of Absences`*100)

# Plot the map series
tmap_style("classic"); tm_graticules(labels.inside.frame = T, labels.show = F) +
  tm_shape(map) +
  tm_fill(col = "Share of Absences", style = "quantile", n = 10,  
          palette = "RdBu", colorNA = "white", textNA = "No Data")  +
  tm_borders(lwd = 0.5) +
  tm_compass(type = "rose", position = c("right", "top"), size = 2) +
  tm_scale_bar(width = 0.2, position = c("right", "bottom"), text.size = 1)

Air Quality Index

Show the Code

# Set the path of the R-studio project
file = gsub("website", "", getwd())
aqif = read_rds(paste0(file, "/02_GenData/02_aqi/AqiForecast.rds"))
aqi = read_rds(paste0(file, "/02_GenData/02_aqi/RealAqi.rds"))

Table 3 contains descriptive statistics on the AQI and its forecast. The average and maximum forecast is slightly smaller (50.39) than the AQI (54.36). The average AQI is 54.36, slightly above the EPA threshold between good and moderate air quality.

Show the Code

# Construct the data set for the summary table
tab = left_join(filter(aqif, date < "2018-12-31", date > "2008-12-31"), aqi)
# Summary table
sumtable(select(ungroup(tab) ,aqi, RealAqi), 
         digits = 2, fixed.digits = T, 
         summ = c("mean(x)", "median(x)", "sd(x)", "min(x)", 
                  "max(x)", "notNA(x)","countNA(x)"), 
         summ.names = c("Mean", "Median", "SD", "Min", "Max", "N", "NAs"), 
         labels = c("AQI Forecast", "AQI Real"), out = "kable") %>%
  kable_classic(full_width = F, html_font = "Cambria") %>% 
  kable_styling(bootstrap_option = c("hover")) |> 
  column_spec(c(1), italic = T, color = "DarkBlue")

Table 3: Descriptive Statistics on the air quality index
Variable	Mean	Median	SD	Min	Max	N	NAs
AQI Forecast	50.39	48	18.78	4	151	3565	0
AQI Real	54.36	51	21.21	16	208	3565	0

Figure 12 shows the average daily value of the forecast. Although there is no clear evidence of a long-term trend, seasonality does appear quite relevant for the time series. Summer (and to a lesser degree winter) forecasts are consistently higher than for the other seasons. Furthermore, the probability of forecasts higher than 100 units is also higher during the summer. For instance, out of 131 alerts, one hundred and five occurred during the summer, fourteen in the spring, eight in Autumn, and only four in the winter.

Show the Code

#### Construct the data for the plot 
plot = filter(aqif, date < "2018-12-31") |>  
  mutate(season = hydroTSM::time2season(date, out.fmt = "seasons")) |> 
  mutate(season = str_to_title(season)) 


#### Plot the time series
ggplot(plot) + geom_bar(aes(x = date, y =aqi, fill = season), stat = "identity", alpha = 0.5) +
  geom_hline(aes(yintercept = 100), color = "red") + 
  labs(y = "Air Quality Index (Forecast)", x = "") +
  scale_fill_manual(values = c("Spring" = "#F97A1F", "Summer" = "#C91D42", 
                               "Winter" = "#475ED1", "Autumm"  = "#B3B3B3")) +
    theme_economist() %+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.y = element_text(vjust = 2, angle = 90))  + grids("y")

Figure 12: Mean daily value of the AQI forecast between 2008 and 2018

Figure 13 contains the average monthly, daily, and yearly value of forecast. Predicted values are significantly higher during the summer months of June to August and the Winter Months of December to January. While higher summer values are in line with higher ozone formation during the hotter months of the year because of the interaction of solar radiation with ozone precursors, higher winter measures relate to the lower efficiency of internal combustion engines at cold temperatures and the use of fossil fuels for residential heating. Even though the AQI value seems higher between 2008 and 2011 and lower after that, it is not straightforward to notice any long-term trend. Regarding weekday variation, there are slightly higher average values at the start of the week (Monday to Wednesday) and lower from Thursday to Sunday.

Show the Code

#
# Construct the data for the plot 
plot = list(Yearly = filter(aqif, date < "2018-12-31") |>  
  mutate(year = substr(year(date),3,4)) |> 
  group_by(frame = year) %>% summarise(aqi = mean(aqi, na.rm = T)), 
  
  Monthly = filter(aqif, date < "2018-12-31") |>  
  mutate(month = month(date, label = T)) |> 
  group_by(frame = month) %>% summarise(aqi = mean(aqi, na.rm = T)), 
  
  Weekly = filter(aqif, date < "2018-12-31") |>  
  mutate(weekday = weekdays(date, abbreviate = T)) |> 
  group_by(frame = weekday) %>% summarise(aqi = mean(aqi, na.rm = T))) %>% 
  
  rbindlist(., idcol = "window")

# Compute the deviation from the mean
plot = plot |>  group_by(window) |> 
  mutate(change = ((aqi -mean(aqi))/mean(aqi))*100)

# Plot the time series 
ggplot(plot) + 
  geom_bar(aes(y = aqi, x = frame, group = 1), fill = "#141f52", stat = "identity") + 
  geom_point(aes(y = aqi, x = frame, group = 1), color = "red") + 
  
  labs(x = "", y = "Air Quality Index") + facet_wrap(~window, scales = "free") +
  
  theme_economist()%+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.y = element_text(vjust = 2, angle = 90), 
        strip.text = element_text(hjust = 0)) + grids("y")

Figure 13: Mean monthly value of the AQI forecast

Figure 14 contains the density function of the AQI forecast. The forecasts appear normally distributed between zero and slightly before one hundred units. However, as we approach one hundred units, there is some clear bunching around the alert threshold, suggesting that the model used by the NYSDEC may work slightly differently when the forecast is in the neighborhood of the alert. In the empirical strategy, we exploit this bunching to identify the effects of the alert on school absences.

Show the Code

# Plot the time series 
ggplot(aqif) +  
  ggdist::stat_halfeye(aes(x = aqi), height = .6,  
                       justification = -.3,.width = 0, point_colour = NA, fill = "#141f52") +
  geom_point(aes(x = aqi, y = 0), size = 0.2, alpha = .2, 
             position = position_jitter(seed = 1, height = 0.1), color = "red") +
  geom_boxplot(aes(x = aqi), width = .15, fill = "transparent", outlier.shape = NA, color = "black") +
  
  coord_cartesian(ylim = c(-0.1, 0.8)) + theme_economist() %+replace% 
  theme(legend.title = element_blank(), axis.title.y = element_text(vjust = 2, angle = 90), 
        plot.background = element_blank()) + grids("y") + 
  labs(x = "average AQI forecast", y = "")

Figure 14: Distribution of the AQI forecast

Figure 15 contains the time series (left panel) and density distribution (right panel) of average mean daily differences between the predicted and actual AQI. With a maximum difference of 56 and a minimum of -92 AQI units there appear to be relevant differences between both time series. These differences aggregate to an average between both variables of -3.9 with a standard deviation of 15.17 units.

Show the Code

#### Data for plotting daily differences between the AQI and the forecast ####
plot = filter(aqif, date < "2018-12-31") |>  
  mutate(season = hydroTSM::time2season(date, out.fmt = "seasons")) |> 
  mutate(season = str_to_title(season)) %>% left_join(., aqi) |> 
  mutate(diff = aqi-RealAqi)

#### Time series of differences
ggplot(plot) + geom_bar(aes(x = date, y =diff, fill = season), stat = "identity", alpha = 0.5) +
  labs(y = "Differences between the AQI and its forecast", x = "") +
  scale_fill_manual(values = c("Spring" = "#F97A1F", "Summer" = "#C91D42", 
                               "Winter" = "#475ED1", "Autumn"  = "#B3B3B3")) +
  theme_economist() %+replace% 
  theme(legend.title = element_blank(), axis.title.y = element_text(vjust = 2, angle = 90), 
        plot.background = element_blank()) + grids("y")

#### Plot the density distribution of differences ####
ggplot(plot) +  
  stat_halfeye(aes(x = diff), height = .6,justification = -.3,
               .width = 0, point_colour = NA, fill = "#141F52") +
  geom_point(aes(x = diff, y = 0), size = 0.2, alpha = .1, 
             position = position_jitter(seed = 1, height = 0.1), color = "red") +
  geom_boxplot(aes(x = diff), width = .15, fill = "transparent", outlier.shape = NA, color = "black") +
  
   coord_cartesian(ylim = c(-0.1, 0.8)) + labs(x = "Difference between the AQI and its forecast", y = "") +
  
  theme_economist() %+replace% 
  theme(legend.title = element_blank(), axis.title.y = element_text(vjust = 2, angle = 90), plot.background = element_blank()) +
  grids("y")

Figure 16 portrays the average yearly, monthly, and weekday value of the AQI measure and its forecasts. The ability of the NYSDEC to predict the AQI increased between 2008 and 2017. With a mean difference in 2008 of close to four units and for 2018 of less than 0.5. It is interesting to see that the difference is lower during the summer months of June to August and higher at the start of the year between January and April. Regarding weekday differences, although the value of the AQI is consistently higher than its forecast, I cannot distinguish any pattern in the difference betweeen both time series.

Show the Code

#### Yearly Differences ####
plot = filter(aqif, date < "2018-12-31") |>  
  mutate(year = year(date)) %>% left_join(., aqi) |> 
  group_by(frame = year) %>% 
  summarise(`AQI Forecast` = mean(aqi, na.rm = T), `AQI Measure` = mean(RealAqi, na.rm = T)) %>% 
  gather(., var, value, -frame)


#### Plot it ####
ggplot(plot) + geom_line(aes(y = value, x = frame, group = var, linetype = var)) + 
  geom_point(aes(y = value, x = frame, group = 1, color = var)) + 
  
  labs(x = "", y = "") + 
  scale_color_manual(values = c("#141F52", "#c91d42")) +
  scale_x_continuous(breaks = pretty_breaks()) +
  
    theme_economist()%+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.y = element_text(vjust = 2, angle = 90)) + 
  grids("y")

#### Monthly time series ####
plot = filter(aqif, date < "2018-12-31") |>  
  mutate(month = month(date, label = T)) %>% left_join(., aqi) |> 
  group_by(frame = month) %>% 
  summarise(`AQI Forecast` = mean(aqi, na.rm = T), `AQI Measure` = mean(RealAqi, na.rm = T)) %>% 
  gather(., var, value, -frame)


#### Plot it ####
ggplot(plot) + geom_line(aes(y = value, x = frame, group = var, linetype = var)) + 
  geom_point(aes(y = value, x = frame, group = 1, color = var)) + 
  
  
  labs(x = "", y = "") + 
  scale_color_manual(values = c("#141F52", "#c91d42")) +
  
        theme_economist()%+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.y = element_text(vjust = 2, angle = 90)) + 
  grids("y")


#### Weekday differences ####
plot = filter(aqif, date < "2018-12-31") |>  
  mutate(weekday = weekdays(date)) %>% left_join(., aqi) |> 
  group_by(frame = weekday) %>% 
  summarise(`AQI Forecast` = mean(aqi, na.rm = T), `AQI Measure` = mean(RealAqi, na.rm = T)) %>% 
  gather(., var, value, -frame) %>% 
  mutate(frame = as_factor(frame)) |> 
  mutate(frame = fct_relevel(frame, c("Sunday", "Monday", "Tuesday", 
                                      "Wednesday", "Thursday", "Friday", 
                                      "Saturday")))


#### Plot it ####
ggplot(plot) + geom_line(aes(y = value, x = frame, group = var, linetype = var)) + 
  geom_point(aes(y = value, x = frame, group = 1, color = var)) +
  
  labs(x = "", y = "") + 
  scale_color_manual(values = c("#141F52", "#c91d42")) +
  
      theme_economist()%+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.y = element_text(vjust = 2, angle = 90)) + 
  grids("y")

Figure 17 contains the density distributions of the real and forecasted AQIs. Two things stand out. First, the AQI forecast has more density around its median and a smaller tail; second, there is a clear difference in the bunching of the forecast around 100 units.

Show the Code

plot = filter(aqif, date < "2018-12-31") |>  
  mutate(weekday = weekdays(date)) %>% left_join(., aqi) %>% 
  select(date, `AQI Forecast` = aqi, `AQI Measure` = RealAqi) %>%
  gather(., var, value, -date)


# Plot the time series 
ggplot(plot) +  ggdist::stat_halfeye(aes(x = value, fill = var), height = .6,  
                       justification = -.3,.width = 0, point_colour = NA, alpha = 0.5) +
  geom_boxplot(aes(x = value, color = var), width = .25, fill = "transparent", outlier.shape = NA) +
   coord_cartesian(ylim = c(-0.1, 0.8))  + 
  labs(x = "", y = "") +
  scale_fill_manual(values = c("#141F52", "#c91d42")) +
  scale_color_manual(values = c("#141F52", "#c91d42")) +
  
  theme_economist() %+replace% 
  theme(legend.title = element_blank(), plot.background = element_blank(),
        axis.title.x = element_text(vjust = -2)) + grids("y")

Figure 17: Density functions of the real and forecasted AQI values