# load needed libraries
library(tidyverse)
library(mdsr)

# Print the raw text object (not defined in this file; presumably
# provided by the mdsr package loaded above)
Macbeth_raw

# Break the raw text on Windows-style line endings. str_split() returns a
# list, so keep its first element: a character vector with one entry per line
macbeth <- str_split(Macbeth_raw, "\r\n")[[1]]

# Number of lines obtained from the split
length(macbeth)

# Peek at a short excerpt (elements 300 through 310)
macbeth[seq(300, 310)]

# Keep only the lines containing "  MACBETH" (two leading spaces),
# then report how many there are and show the first few
macbeth_lines <- str_subset(macbeth, "  MACBETH")
length(macbeth_lines)
head(macbeth_lines, n = 6)

# Same idea for Macduff: print every line containing "  MACDUFF"
str_subset(macbeth, "  MACDUFF")

# Build a lookup table of characters: a display name plus the regex used
# to find that character's name in the text. For each regex, `speaks` is a
# list-column holding the logical vector returned by str_detect() over
# every element of `macbeth` (TRUE where the pattern matches that line)
macbeth_chars <- tibble(
  name = c("Macbeth", "Lady Macbeth", "Banquo", "Duncan"),
  regexp = c(
    "  MACBETH\\.",
    "  LADY MACBETH\\.",
    "  BANQUO\\.",
    "  DUNCAN\\."
  )
) |>
  mutate(speaks = map(regexp, \(pattern) str_detect(macbeth, pattern)))

# Compute the speaker frequencies: expand each character's logical
# `speaks` vector into one row per (character, line of text) pair
speaker_freq <- macbeth_chars |>
  unnest(cols = speaks) |>
  mutate(
    # unnest() emits each character's rows consecutively, so the line index
    # restarts for every character. Repeat it once per row of macbeth_chars
    # instead of assuming a fixed number of characters; seq_along() is also
    # safe when `macbeth` is empty, unlike 1:length()
    line = rep(seq_along(macbeth), times = nrow(macbeth_chars)),
    # 1 for TRUE (pattern matched on that line), 0 for FALSE
    speaks = as.numeric(speaks)
  ) |>
  # NOTE(review): the 218/3172 bounds presumably trim non-play front and
  # back matter from the text -- confirm against the source document
  filter(line > 218, line < 3172)

# Make a data frame of the different acts of the play: the position of
# each act heading, the full heading line, and the "ACT <numeral>" label.
# The numeral is matched with the character class [IV]; a `|` inside
# brackets would be a literal pipe character, not alternation, so it is
# deliberately left out
act_regexp <- "^ACT [IV]+"
acts <- tibble(
  line = str_which(macbeth, act_regexp),
  # index by the matched positions so the text and positions cannot drift
  line_text = macbeth[line],
  labels = str_extract(line_text, act_regexp)
)

# Plot a loess-smoothed curve of `speaks` against line number for each
# character, with a dotted vertical rule and a text label at every act
speaker_freq |>
  ggplot(aes(x = line, y = speaks)) +
  geom_smooth(aes(color = name), method = "loess", span = 0.4, se = FALSE) +
  # act boundaries
  geom_vline(
    data = acts,
    aes(xintercept = line),
    linetype = 3,
    color = "darkgray"
  ) +
  # act labels, drawn at a fixed height and left-aligned on each boundary
  geom_text(
    data = acts,
    aes(y = 0.085, label = labels),
    color = "darkgray",
    hjust = "left"
  ) +
  scale_color_brewer(palette = "Set2") +
  labs(x = "Line Number", y = "Proportion of Speeches") +
  # pin the lower y limit at 0 and let the upper limit be chosen from the data
  ylim(0, NA)