Problem statement

In order to illustrate how you can easily apply functions to segments of your data, let’s use actual data to answer a very real question in financial markets.

Hopefully this illustration will help you think of ways that you can apply the split-apply-combine paradigm.

Question:

  • You’ve been approached by an asset manager to do an analysis related to how similar active manager returns are relative to the local benchmark (FTSE/JSE Capped SWIX).

  • You’ve been provided with actual fund manager returns (names hidden) and the benchmark returns below.

  • Build a function that runs a rolling N-month regression of each manager’s returns onto the local benchmark - from this, determine the average rolling N-month Beta of each manager to the local benchmark.

Hopefully you can illustrate, simply, how high the aggregate beta of local managers is (i.e. how much of each manager’s returns is determined by the market).

Steps:

  1. Load data:
# Data:
# fmxdat supplies the anonymised ASISA active manager returns and the local
# index series; ticker J433 is the FTSE/JSE Capped SWIX benchmark.
library(tidyverse);library(fmxdat)
active_mans <- fmxdat::ASISA
BM <- fmxdat::Local_Indexes %>% filter(Tickers %in% "J433")
  2. Build function that runs a rolling N-month regression for each fund on the local benchmark.
  3. First, split your dataframe so that each manager’s return is its own list.

To do this, use group_split

# Split the dataframe into a list with one element (a tibble) per fund manager:
Manager_list <- active_mans %>% group_split(FundName)
# E.g., now see:
# Manager_list[[1]]
# Manager Count:
length(Manager_list)
## [1] 252
  4. Build a regression model that runs a rolling N-period regression of fund manager returns onto the benchmark:
# It's not easy to build code on lists. 
# Let's build the function to handle one manager before we map all managers onto the function

Manager <- Manager_list[[1]]

# Now, in order to do rolling analyses, let's use fmxdat's useful 
# call to split your date vector into Rolling N period blocks:

# Unique, sorted month-end dates for this manager:
Alldates <- Manager %>% select(date) %>% unique %>% arrange(date)

# Let's do Rolling 36 month returns:
Nsel = 36
# Date_Roller returns a list of overlapping N-month date windows,
# each shifted forward by one month relative to the previous:
Roll_Dates <- Alldates %>% pull(date) %>% fmxdat::Date_Roller(., N = Nsel)

# Note what fmxdat::Date_Roller now does:
Roll_Dates[1:3]
## [[1]]
##  [1] "2006-03-31" "2006-04-30" "2006-05-31" "2006-06-30" "2006-07-31"
##  [6] "2006-08-31" "2006-09-30" "2006-10-31" "2006-11-30" "2006-12-31"
## [11] "2007-01-31" "2007-02-28" "2007-03-31" "2007-04-30" "2007-05-31"
## [16] "2007-06-30" "2007-07-31" "2007-08-31" "2007-09-30" "2007-10-31"
## [21] "2007-11-30" "2007-12-31" "2008-01-31" "2008-02-29" "2008-03-31"
## [26] "2008-04-30" "2008-05-31" "2008-06-30" "2008-07-31" "2008-08-31"
## [31] "2008-09-30" "2008-10-31" "2008-11-30" "2008-12-31" "2009-01-31"
## [36] "2009-02-28"
## 
## [[2]]
##  [1] "2006-04-30" "2006-05-31" "2006-06-30" "2006-07-31" "2006-08-31"
##  [6] "2006-09-30" "2006-10-31" "2006-11-30" "2006-12-31" "2007-01-31"
## [11] "2007-02-28" "2007-03-31" "2007-04-30" "2007-05-31" "2007-06-30"
## [16] "2007-07-31" "2007-08-31" "2007-09-30" "2007-10-31" "2007-11-30"
## [21] "2007-12-31" "2008-01-31" "2008-02-29" "2008-03-31" "2008-04-30"
## [26] "2008-05-31" "2008-06-30" "2008-07-31" "2008-08-31" "2008-09-30"
## [31] "2008-10-31" "2008-11-30" "2008-12-31" "2009-01-31" "2009-02-28"
## [36] "2009-03-31"
## 
## [[3]]
##  [1] "2006-05-31" "2006-06-30" "2006-07-31" "2006-08-31" "2006-09-30"
##  [6] "2006-10-31" "2006-11-30" "2006-12-31" "2007-01-31" "2007-02-28"
## [11] "2007-03-31" "2007-04-30" "2007-05-31" "2007-06-30" "2007-07-31"
## [16] "2007-08-31" "2007-09-30" "2007-10-31" "2007-11-30" "2007-12-31"
## [21] "2008-01-31" "2008-02-29" "2008-03-31" "2008-04-30" "2008-05-31"
## [26] "2008-06-30" "2008-07-31" "2008-08-31" "2008-09-30" "2008-10-31"
## [31] "2008-11-30" "2008-12-31" "2009-01-31" "2009-02-28" "2009-03-31"
## [36] "2009-04-30"

From the above chunk - note that Date_Roller split my date vector into lists, where each subsequent entry is the previous 36 month date vector with the new date added and the first subtracted.

Let’s use the above now to calculate a rolling regression function:

# Let's take the first Roll_Dates:
# Take the first rolling window of dates:
Dsel <- Roll_Dates[[1]]

# Restrict the manager's returns to only the month-ends in that window:
df_select <- Manager %>% filter(date %in% Dsel)
  5. Make BM returns monthly

Before we can do regressions - the benchmark and portfolio returns need to have the same periodicity.

Let’s make the daily returns monthly below:

# Go from returns to index:

BM_Monthly <- 
BM %>% 
  # Make sure returns are chained in date order before compounding:
  arrange(date) %>% 
  # Below is the chained returns to make it an index:
  mutate( Index = cumprod(1+Returns)) %>% 
  # Now filter to just be end of month
  # (use "%Y%m" rather than "%Y%B" so the grouping key is locale-independent):
  mutate(Yearmonth = format(date, "%Y%m")) %>% 
  group_by(Yearmonth) %>% 
  filter(date == last(date)) %>% 
  ungroup() %>% arrange(date) %>% 
  # Now calculate returns for month to month (slice(-1) drops the first NA return):
  mutate(BM_Monthly = Index / lag(Index) - 1) %>% slice(-1)

# Let's ensure the same end of month dates are in manager returns and benchmark
# (i.e. important that you don't have 28 Feb in one, and 29 Feb in another...)

# BUGFIX: the original checks used !any(x %in% y), which only triggers when
# there is NO overlap at all. To flag ANY mismatching month, use any(!x %in% y).
if( any( !df_select$date %in% BM_Monthly$date) ) stop(glue::glue("Some months for manager not in BM: {unique(df_select$FundName)}"))
# For the reverse check, only consider BM dates inside the current window -
# BM spans the full history, so comparing all of it would always trigger:
BM_dates_in_window <- BM_Monthly %>% filter(date %in% Dsel) %>% pull(date)
if( any( !BM_dates_in_window %in% df_select$date) ) stop(glue::glue("Some BM months not in manager returns: {unique(df_select$FundName)}"))
  6. Right, let’s now build that Beta model!

Regress 36 month manager returns on the benchmark, and pull the beta:

# Join the benchmark's monthly returns onto the manager's selected window:
dfreg <- left_join(df_select, BM_Monthly %>% select(date, BM_Monthly) %>% unique, by = "date")
# Guard: the regression window must contain exactly Nsel months of data:
if(nrow(dfreg) != Nsel) stop(glue::glue("Dates for {unique(df_select$FundName)} not equal to 36 at start date: {Dsel[1]}"))

# Regression spec:
lmreg <- lm(Returns~BM_Monthly, data = dfreg)

# Pull Beta and create a table for the manager:
# (Intercept -> "Alpha"; benchmark slope -> "Beta"; dated at the window's end)
Output <- 
  lmreg %>% broom::tidy() %>% select(term, coef = estimate) %>% 
  mutate(term = ifelse(grepl("Intercept", term), "Alpha",
                       ifelse(grepl("BM_M", term), "Beta", NA))) %>% 
  mutate(Manager = unique(dfreg$FundName), date = last(Dsel))
  

Output
## # A tibble: 2 × 4
##   term     coef Manager date      
##   <chr>   <dbl> <chr>   <date>    
## 1 Alpha 0.00115 Fund_1  2009-02-28
## 2 Beta  0.820   Fund_1  2009-02-28

Right, so the above achieves what we want to do. The next step is now to apply this to every manager, and every date list vector.

This is where the power of mapping comes into play!

map, map_df, …

Map applies a function to each element of a list.

For a simple example to illustrate, let’s say you have the following dataframe:

fmxdat::Commodities 
## # A tibble: 1,560 × 4
##    date       Name       Zar_value  returns
##    <date>     <chr>          <dbl>    <dbl>
##  1 1990-02-28 Bcom_Index      240.  0.0124 
##  2 1990-03-30 Bcom_Index      250.  0.0411 
##  3 1990-04-30 Bcom_Index      250.  0.00127
##  4 1990-05-31 Bcom_Index      245. -0.0208 
##  5 1990-06-29 Bcom_Index      242. -0.0115 
##  6 1990-07-31 Bcom_Index      244.  0.00813
##  7 1990-08-31 Bcom_Index      264.  0.0821 
##  8 1990-09-28 Bcom_Index      299.  0.130  
##  9 1990-10-31 Bcom_Index      278. -0.0679 
## 10 1990-11-30 Bcom_Index      261. -0.0627 
## # ℹ 1,550 more rows

…now suppose I want to calculate the proportion of returns above and below 5% and -5%.

This means I want to apply a function to each Name, that counts those observations.

Using map, it can be done as follows:

split_df <- fmxdat::Commodities %>% group_split(Name)

minmax_function <- function(split_df){
  # For a single commodity's dataframe, compute the share of monthly returns
  # above +5% and the share below -5%.
  # For development, I like to run the following line:
  # dat = split_df[[1]]
  dat <- split_df
  n_obs <- nrow(dat)
  tibble(
    N_above = sum(dat$returns > 0.05) / n_obs,
    N_below = sum(dat$returns < -0.05) / n_obs,
    Name = unique(dat$Name)
  )
}

# So testing it on the first group, let's check the function works:
minmax_function(split_df[[1]])
## # A tibble: 1 × 3
##   N_above N_below Name      
##     <dbl>   <dbl> <chr>     
## 1   0.146   0.103 Bcom_Index

…now the power of map comes through, as I can map through each element of my list and apply the above function:

# Note the use of the tilde (~):
split_df %>% map( ~minmax_function(.))
## [[1]]
## # A tibble: 1 × 3
##   N_above N_below Name      
##     <dbl>   <dbl> <chr>     
## 1   0.146   0.103 Bcom_Index
## 
## [[2]]
## # A tibble: 1 × 3
##   N_above N_below Name 
##     <dbl>   <dbl> <chr>
## 1   0.177   0.115 Gold 
## 
## [[3]]
## # A tibble: 1 × 3
##   N_above N_below Name     
##     <dbl>   <dbl> <chr>    
## 1   0.326   0.215 Oil_Brent
## 
## [[4]]
## # A tibble: 1 × 3
##   N_above N_below Name 
##     <dbl>   <dbl> <chr>
## 1   0.246   0.151 Plat

…the above returns a list of all the answers. It is bindable, however - let’s see how we can combine it into a single df:

result <- split_df %>% map_df( ~minmax_function(.))

result
## # A tibble: 4 × 3
##   N_above N_below Name      
##     <dbl>   <dbl> <chr>     
## 1   0.146   0.103 Bcom_Index
## 2   0.177   0.115 Gold      
## 3   0.326   0.215 Oil_Brent 
## 4   0.246   0.151 Plat

The above principle is incredibly powerful.

Returning to our earlier rolling beta example

Right, so you’ve now seen the principle of maps.

Let’s return to our earlier example and apply the principle.

Notice that when binding results using map_df - it is important to anticipate issues and deal with it using the NULL principle.

… when binding results, if a result is invalid - make it return NULL, which can be bound to anything.

See example below as illustration!

NOTE:

We can do this far quicker (using various means, and also parallel processing) - for now, it is about learning the principle of applying functions to list elements using maps:

# We had our first list: the managers:  Manager_list

# ...and then within the function I want to map across the relevant dates for that manager.

# So its a map within a map!

Roll_regression_map <- function(Manager_list, BM_Monthly, Nsel = 36, Silent = FALSE){
  # Runs a rolling Nsel-month regression of ONE manager's returns on the
  # benchmark, returning a dataframe of rolling Alpha / Beta estimates
  # (one row per window, dated at the window's end).
  #
  # Manager_list: a single manager's return dataframe (one element of the
  #               group_split list - the outer map feeds them in one at a time).
  # BM_Monthly:   benchmark dataframe with columns `date` and `BM_Monthly`.
  # Nsel:         rolling window length in months (default 36).
  # Silent:       if TRUE, suppress the per-manager progress message.

  # Again, for stepping through the function this is useful:
  # Manager <- Manager_list[[2]]
  Manager <- Manager_list

  # THIS IS AN IMPORTANT STEP:::: using NULL
  # A manager with fewer than Nsel months cannot produce a single window -
  # returning NULL lets map_df bind the result harmlessly.
  if(nrow(Manager) < Nsel) return(NULL)

  if(!Silent) message(glue::glue("Starting: {unique(Manager$FundName)}"))

  # Identify the rolling date windows:
  Alldates <- Manager %>% select(date) %>% unique %>% arrange(date)

  Roll_Dates <- Alldates %>% pull(date) %>% fmxdat::Date_Roller(., N = Nsel)

  # Join the benchmark ONCE up front, rather than inside every window:
  dfreg <- left_join(Manager, BM_Monthly %>% select(date, BM_Monthly) %>% unique, by = "date")

  Map_roller_regression <- function(dfreg, Roll_Dates){
    # Dsel <- Roll_Dates[[41]]
    Dsel <- Roll_Dates
    dfreg <- dfreg %>% filter(date %in% Dsel)

    # BUGFIX: the message previously referenced df_select (a leftover global
    # from the walked-through example) and hard-coded 36 - use dfreg and Nsel:
    if( nrow(dfreg) != Nsel) stop(glue::glue("Dates for {unique(dfreg$FundName)} not equal to {Nsel} at start date: {Dsel[1]}"))

    # Regression spec:
    lmreg <- lm(Returns ~ BM_Monthly, data = dfreg)

    # Pull Alpha / Beta and tag with the manager and the window's end date:
    lmreg %>% broom::tidy() %>% select(term, coef = estimate) %>%
      mutate(term = ifelse(grepl("Intercept", term), "Alpha",
                           ifelse(grepl("BM_M", term), "Beta", NA))) %>%
      mutate(Manager = unique(dfreg$FundName), date = last(Dsel)) %>% spread(term, coef)
  }

  # Map the inner function across every date window for this manager
  # (a map within a map, once the outer map across managers is applied):
  Roll_Dates %>% map_df(~Map_roller_regression(dfreg, .))
}

#----------------------------------------------------
# Note this will take some time to run...
#----------------------------------------------------

# Map the rolling-regression function across every manager and bind the
# results into one dataframe. Managers with fewer than 36 months return
# NULL inside the function, which map_df binds harmlessly (drops them):
Rolling_Beta <- 
  Manager_list %>% map_df(~Roll_regression_map(., BM_Monthly, Nsel = 36, Silent = F))

Lastly, let’s plot the above in a jitter plot / bar plot to illustrate just how high betas have been, on average, for active managers over the past 10 years.

Note - I will use fmxdat::safe_month_min below to go back ten years safely.

…otherwise, if a month ends on 28 Feb and ten years ago was 29 Feb, it will only go back to 29 Feb, meaning you miss the Feb month…

#----------------------------------------------------
# Suppose we only want to look at funds with the full history as well:
#----------------------------------------------------

jitter_plotter_of_betas <- function(Rolling_Beta, Months_back = 120){
  # Boxplot + jitter of each manager's AVERAGE rolling Beta over the last
  # Months_back months, keeping only funds with a full history over that
  # period. Returns the ggplot object.
  #
  # Rolling_Beta: output of the rolling regression map
  #               (columns: Alpha, Beta, Manager, date).
  # Months_back:  lookback window in months (default 120 = ten years).

  avg_beta <- 
    Rolling_Beta %>% 
    # safe_month_min avoids the 28/29 Feb month-end boundary issue noted above:
    filter(date >= fmxdat::safe_month_min(last(date), N = Months_back)) %>% 
    group_by(Manager) %>% 
    # Only funds with the full history over the lookback window:
    filter(n() >= Months_back) %>% 
    summarise(Beta = mean(Beta, na.rm = TRUE))

  # NOTE(review): FD is the LAST date in the filtered window, yet it is used
  # in a "since {FD}" title below - confirm whether first(date) was intended.
  FD <- 
    Rolling_Beta %>% 
    filter(date >= fmxdat::safe_month_min(last(date), N = Months_back)) %>% 
    filter(date == last(date)) %>% pull(date) %>% 
    unique()

  gsav <- 
    avg_beta %>% 
    ggplot() + 
      geom_boxplot(aes("Beta", Beta), alpha = 0.5, fill = '#0f70ee') + 
      # geom_point(aes("Beta", Beta)) + 
      geom_jitter(aes("Beta", Beta), size = 3) + 
      # geom_jitter(aes("Beta", Beta, color = alphasign)) + 
    # This sets the axis boundaries explicitly  
    coord_cartesian(ylim = c(0, max(avg_beta$Beta))) + 
    labs(title = glue::glue("Active Manager Rolling 3 Year Beta Averages since {format(FD, '%B %Y')}"), 
                                               subtitle = "Beta wrt. FTSE/JSE Capped SWIX using rolling 3 year regression",
                                               x = "", y = "ASISA SA Active Equity Managers") + 
      scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::pretty_breaks(10)) + 
    fmxdat::theme_fmx()

  # To save the plot:
  # ggsave(filename = glue::glue('Beta_Boxplot.png'), plot = gsav, device = 'png', width = 12, height = 6)

  # BUGFIX: the original built gsav but never returned it, so callers got
  # nothing usable back - return the plot explicitly:
  gsav
}

Final Example

Let’s use another example to hit home the S-A-C paradigm.

# Step 1: Define our lists: one dataframe per genre (dropping missing genres):
genres <- fmxdat::spotify %>% filter(!is.na(genre)) %>% group_split(genre)

genre_wrangles <- function(genres, TopQtiles = 0.8, BotQtiles = 0.2){
  # Summarises ONE genre's songs into upper/lower quantile levels for energy,
  # danceability and loudness, returned in long (tidy) format with columns
  # Songs, Genre, which, value.
  #
  # genres:    a single genre's dataframe (one element of the group_split list).
  # TopQtiles: upper quantile level to report (default 80th percentile).
  # BotQtiles: lower quantile level to report (default 20th percentile).

  # For stepping through interactively:
  # genre_sel <- genres[[1]]
  genre_sel <- genres

  genre_sel %>% summarise(Songs = n(),

                          energy_high = quantile(energy, TopQtiles),
                          energy_low = quantile(energy, BotQtiles),

                          dance_high = quantile(danceability, TopQtiles),
                          dance_low = quantile(danceability, BotQtiles),

                          loudness_high = quantile(loudness, TopQtiles),
                          loudness_low = quantile(loudness, BotQtiles)

  ) %>% mutate(Genre = unique(genre_sel$genre)) %>% 
    # gather() is superseded - pivot_longer() is its modern replacement:
    pivot_longer(contains(c("low", "high")), names_to = "which", values_to = "value")
}

Result <- 
genres %>% map_df(~genre_wrangles(., TopQtiles = 0.8, BotQtiles = 0.2))

#----------------------------------------------------
# Let's now create a plot of each category using a mapping function on a plot.
#----------------------------------------------------

# Split the long result into one dataframe per category
# (BUGFIX: the closing parenthesis was missing here, a syntax error):
cats <- Result %>% group_split(which)

# Create folder for plots in home directory:
rmsfuns::build_path("Plots")


# Mapper function:

Plotting_music_catogories <- function(cats){
  # Plots one category's quantile scores per genre as an ordered bar chart
  # and saves it to Plots/<category>.png.
  # (Function name typo "catogories" kept - callers elsewhere use this name.)
  #
  # cats: a single category's dataframe (one element of the group_split list),
  #       with columns Genre, which, value.

  # For stepping through interactively:
  # catsel <- cats[[1]]
  catsel <- cats

  library(colorspace)

  # Generate one pastel hue per genre - sized to the data so it also works if
  # the number of genres changes. "Pastel 1" is the documented palette name.
  pastel_cols <- qualitative_hcl(length(unique(catsel$Genre)), palette = "Pastel 1")

  message(glue::glue("Starting to plot: {unique(catsel$which)}"))

  gsav <- 
    catsel %>% 
      ggplot() + 
      # use 'reorder' below to order by value...
      geom_bar(aes( value, reorder(Genre, value), fill = Genre), stat = "identity", position = "dodge") + 
      # BUGFIX: pastel_cols was generated but never applied - use it here:
      scale_fill_manual(values = pastel_cols) + 
      # We have the guides on the y-axis...
      guides(fill = "none") + 
      # Labels
      labs(x = "Score", y = "Genre", 
           # Note below - I use 'toupper' to make all uppercase.
           title = glue::glue("Category Comparison: {toupper(gsub('_', ' ', unique(catsel$which)))}"), 
           # Note the conditional statement below:
           subtitle = glue::glue("This shows the {ifelse(grepl('_high', unique(catsel$which)), 'top', 'bottom')} quartile level per genre"), caption = "Source: Spotify") +
      # Themes:
      fmxdat::theme_fmx(title.size = ggpts(40), 
                        subtitle.size = ggpts(38),
                        CustomCaption = TRUE)

  # Save (ggsave returns the file path invisibly):
  ggsave(filename = glue::glue('Plots/{unique(catsel$which)}.png'), plot = gsav, 
         device = 'png', width = 12, height = 6)
}

# Let's now create plots per category...
cats %>% map(~Plotting_music_catogories(.))

# walk() does the same as map() but returns its input invisibly, so nothing is
# printed to the console - use it when you only care about side effects (like
# saving plots). Note it does NOT suppress warnings:
cats %>% walk(~Plotting_music_catogories(.))