Overview of Presentation Contents

This presentation introduces essential techniques for analyzing date and time data in health research using R. Key topics include:

Importance of date/time analysis
Core R packages for handling dates (e.g., lubridate, tidyverse)
Date conversion and formatting
Calculating durations and age
Extracting date components (e.g., month, weekday, season)
Summarizing and visualizing time-based trends

1. Importance of Date/Time Analysis in Health Data

Date-time data is crucial in healthcare because it allows us to:

Calculate accurate patient ages, service time
Measure time intervals between critical events (e.g., diagnosis to treatment, age at vaccination)
Identify seasonal patterns in disease occurrence
Track vaccine effectiveness over time
Determine follow-up periods for clinical studies

Load required libraries

library(lubridate) # Simplifies the manipulation of dates and times in R (e.g., formatting, extracting components)
library(dplyr) # Provides tools for data manipulation, helpful for filtering and summarizing date-based data
library(stringr) # Simplifies string manipulation

library(zoo) # For working with time series data, including rolling calculations and handling missing data in time series
library(tsibble) # Provides tools for handling time series data, including date-time indexes and features for forecasting

library(readr) # For reading date-time data from CSV
library(haven) #  data from other statistical software formats (SPSS, SAS, Stata)
library(forecast) # Useful for time series forecasting, especially when working with seasonal or trend-based data
library(ggplot2) # For visualizing date-time trends in data (e.g., time series plots)
library(janitor) # Tabulation, cleaning column names, adding totals and proportions
library (plotly)     # Interactive visualizations

# Documentation/reporting
library(prettydoc)  # Pretty document templates
library(flexdashboard)# Interactive dashboards
library(knitr)      # Dynamic report generation
library(quarto)  # For rendering and publishing documents with the Quarto framework
library(rmarkdown)  # R Markdown document processing
library(yaml)       # YAML document processing

# Tabulation
library(flextable)  # Flexible table formatting
library(gt)         # Grammar of tables
library(reactable)  # Interactive tables

# Folder holding the synthetic CSV extracts. Set the SYNTHETIC_DATA_DIR
# environment variable to point somewhere else; the working directory is
# deliberately left untouched (no setwd()), so the script has no global
# side effect on the R session.
data_dir <- Sys.getenv(
  "SYNTHETIC_DATA_DIR",
  unset = "C:/Users/User/Desktop/Materials_ Course and proposals/Course Related/DataCamp/Data/Synthetic_data"
)

# Read the patient roster and the immunization records
patients <- read.csv(file.path(data_dir, "patients.csv"))
vaccination <- read.csv(file.path(data_dir, "immunizations.csv"))

# Convert column names to lowercase
names(patients) <- tolower(names(patients))
names(vaccination) <- tolower(names(vaccination))

# View data
# head(patients)
# head(vaccination)

str(patients)

'data.frame':   106 obs. of  28 variables:
 $ id                 : chr  "30a6452c-4297-a1ac-977a-6a23237c7b46" "34a4dcc4-35fb-6ad5-ab98-be285c586a4f" "7179458e-d6e3-c723-2530-d4acfe1c2668" "37c177ea-4398-fb7a-29fa-70eb3d673876" ...
 $ birthdate          : chr  "1994-02-06" "1968-08-06" "2008-12-21" "1994-01-27" ...
 $ deathdate          : chr  "" "2009-12-11" "" "" ...
 $ ssn                : chr  "999-52-8591" "999-75-3953" "999-70-1925" "999-27-9779" ...
 $ drivers            : chr  "S99996852" "S99993577" "" "S99995100" ...
 $ passport           : chr  "X47758697X" "X28173268X" "" "X83694889X" ...
 $ prefix             : chr  "Mr." "Mr." "" "Mrs." ...
 $ first              : chr  "Joshua658" "Bennie663" "Hunter736" "Carlyn477" ...
 $ middle             : chr  "Alvin56" "" "Mckinley734" "Florencia449" ...
 $ last               : chr  "Kunde533" "Ebert178" "Gerlach374" "Williamson769" ...
 $ suffix             : logi  NA NA NA NA NA NA ...
 $ maiden             : chr  "" "" "" "Rogahn59" ...
 $ marital            : chr  "M" "D" "" "M" ...
 $ race               : chr  "white" "white" "white" "asian" ...
 $ ethnicity          : chr  "nonhispanic" "nonhispanic" "nonhispanic" "nonhispanic" ...
 $ gender             : chr  "M" "M" "M" "F" ...
 $ birthplace         : chr  "Boston  Massachusetts  US" "Chicopee  Massachusetts  US" "Spencer  Massachusetts  US" "Franklin  Massachusetts  US" ...
 $ address            : chr  "811 Kihn Viaduct" "975 Pfannerstill Throughway" "548 Heller Lane" "160 Fadel Crossroad Apt 65" ...
 $ city               : chr  "Braintree" "Braintree" "Mattapoisett" "Wareham" ...
 $ state              : chr  "Massachusetts" "Massachusetts" "Massachusetts" "Massachusetts" ...
 $ county             : chr  "Norfolk County" "Norfolk County" "Plymouth County" "Plymouth County" ...
 $ fips               : int  25021 25021 NA NA NA 25017 25017 25021 25009 NA ...
 $ zip                : int  2184 2184 0 0 0 2138 2476 2184 1835 0 ...
 $ lat                : num  42.2 42.3 41.6 41.8 42.7 ...
 $ lon                : num  -71 -71 -70.9 -70.7 -71 ...
 $ healthcare_expenses: num  56905 124024 45645 12895 18500 ...
 $ healthcare_coverage: num  18020 1075 6155 659952 5494 ...
 $ income             : int  100511 49737 133816 17382 52159 75767 58294 49737 77756 35255 ...

str(vaccination)

'data.frame':   1619 obs. of  6 variables:
 $ date       : chr  "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" ...
 $ patient    : chr  "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" ...
 $ encounter  : chr  "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" ...
 $ code       : int  140 113 43 114 140 140 140 140 140 52 ...
 $ description: chr  "Influenza  seasonal  injectable  preservative free" "Td (adult)  5 Lf tetanus toxoid  preservative free  adsorbed" "Hep B  adult" "meningococcal MCV4P" ...
 $ base_cost  : num  136 136 136 136 136 136 136 136 136 136 ...

# Treat blank patient IDs as missing, then report how many IDs are missing
patients <- patients %>%
  mutate(id = na_if(id, ""))
print(sum(is.na(patients[["id"]])))

[1] 0

# Number the rows within each patient id (after sorting by id);
# any value of `dup` above 1 marks a repeated id
patients <- patients %>%
  arrange(id) %>%
  group_by(id) %>%
  mutate(dup = seq_len(n())) %>%
  ungroup()

# A table containing only the value 1 means every id occurs exactly once
table(patients[["dup"]])


  1 
106

# Merge with vaccination data
# Copy the patient identifier into an `id` column so both tables share a key name
vaccination[["id"]] <- vaccination[["patient"]]


# Keep every patient row and attach that patient's vaccination records
vacc_pt_merged <- patients %>%
  left_join(vaccination, by = "id")

# Alternative that joins on the differently named keys directly:
#vacc_pt_merged <- left_join(patients, vaccination, by = c("id" = "patient"))

str(vacc_pt_merged)

Converting Character to Date Format

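In the dataset, birthdate, deathdate, and date (the vaccination timestamp) are all stored as character strings. They need to be converted to proper Date objects (for example with lubridate::ymd(), readr::parse_date(), or base as.Date()) before any date arithmetic is possible.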

# Convert to date format
str (vacc_pt_merged$birthdate)

 chr [1:1619] "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" ...

str (vacc_pt_merged$date) # date is an ISO 8601 UTC timestamp string (YYYY-MM-DDTHH:MM:SSZ), e.g. "2015-06-24T14:05:28Z"

 chr [1:1619] "2015-06-24T14:05:28Z" "2016-06-29T14:05:28Z" ...

# Convert the character date columns to Date objects with lubridate
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Birth dates are plain "YYYY-MM-DD" strings
    birthdate = ymd(birthdate),
    # A blank deathdate means no recorded death: map "" to NA before parsing
    # so those rows become NA dates instead of parse failures
    deathdate = ymd(na_if(deathdate, "")),
    # `date` is an ISO 8601 timestamp with a trailing "Z" (UTC). Parse it
    # explicitly as UTC so the result does not depend on the session's
    # time zone, then keep only the calendar date.
    vacc_date = as_date(ymd_hms(date, tz = "UTC"))
  )

# Check structure
str (vacc_pt_merged$birthdate)

 Date[1:1619], format: "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" ...

str (vacc_pt_merged$vacc_date)

 Date[1:1619], format: "2015-06-24" "2016-06-29" "2016-06-29" "2017-07-05" "2017-07-05" ...

# Check converted formats via glimpse
# glimpse(vacc_pt_merged)

# no missing values on dates
sum(is.na(vacc_pt_merged$birthdate))

[1] 0

sum(is.na(vacc_pt_merged$vacc_date))

[1] 0

# Check date ranges
summary(vacc_pt_merged$birthdate)

        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1914-03-03" "1969-12-09" "1999-02-24" "1990-12-03" "2012-09-02" "2023-03-01"

summary(vacc_pt_merged$vacc_date)

        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1962-12-09" "2017-05-21" "2020-01-04" "2019-04-29" "2022-01-26" "2024-10-26"

Now date columns are in Date format (YYYY-MM-DD), ready for analysis!

Age and Service year Calculation

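Age is a core derived variable in health data. Below, age_years is the patient's current age in completed years (as of the day the script is run), and vacc_service_year is the calendar year of the vaccination. Age at the time of vaccination is derived in the next section as years_to_vax.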

vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Current age in completed years, as of the day the script runs.
    # A calendar-aware interval is used instead of days / 365.25, which comes
    # out one year short on an exact birthday (365 / 365.25 < 1).
    # Note: the reference date is always today, so patients with a recorded
    # deathdate keep ageing under this definition.
    age_years = floor(time_length(interval(birthdate, Sys.Date()), "years")),

    # Calendar year in which the vaccination was given
    vacc_service_year = year(vacc_date)
  )


summary(vacc_pt_merged$age_years)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    2.0    13.0    26.0    34.4    55.0   111.0

summary(vacc_pt_merged$vacc_service_year)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1962    2017    2020    2019    2022    2024

# tabulation
table(vacc_pt_merged$age_years)


  2   3   6   7   8  10  11  12  13  14  15  16  17  18  19  20  21  22  24  25 
 46  24  64  66  34  72  30  38  59  21  21  51  66  18  36  18  69  13  13  12 
 26  28  29  30  31  34  37  38  39  41  42  44  45  46  47  48  49  50  52  53 
 47  17  30   9  38  11   6   8   6  27  46   9  17  15  13  18  42  15  13  30 
 54  55  56  57  59  61  62  63  65  66  69  70  72  73  75  77  79  81  86  98 
 16  15  15  30  14  26  13  62  26  14  14  57  15  27  15  14  12  13  12  20 
111 
  1

table(vacc_pt_merged$vacc_service_year)


1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 2001 2002 2003 2004 2005 2006 
   1    3    1    1    1    2    1    1    1    1    1    2    1    1    2    1 
2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 
   3    6    2    3    2    4    3   22  153  139  137  151  161  137  263  120 
2023 2024 
 170  122

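As shown above, the average current age (age_years, measured as of the analysis date and averaged over vaccination records) was approximately 34.4 years, with a range from 2 to 111 years. Age at the time of vaccination is computed in the next section as years_to_vax.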

The highest frequency of vaccinations occurred in the year 2021, suggesting a peak in service delivery during that period.

Calculating Time Differences

Time differences can be computed by simple subtraction of Date objects, with difftime(), or with lubridate::interval() when calendar-exact months or years are needed.

Example: the number of years between birth and vaccination gives the patient's age at vaccination.

vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Elapsed days from birth to vaccination; the other units derive from it
    days_to_vax = as.numeric(difftime(vacc_date, birthdate, units = "days")),
    # Weeks are exactly days / 7
    weeks_to_vax = days_to_vax / 7,
    # Approximate months: 30.44 is the average number of days per month
    months_to_vax = days_to_vax / 30.44,
    # Completed calendar years, i.e. age at vaccination. A calendar-aware
    # interval is used because floor(days / 365.25) is one year short when
    # the vaccination falls exactly on a birthday (365 / 365.25 < 1).
    years_to_vax = floor(time_length(interval(birthdate, vacc_date), "years"))
  )


# View summary statistics 
summary(vacc_pt_merged$days_to_vax)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0    2548    7742   10375   18263   36813

summary(vacc_pt_merged$weeks_to_vax)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0     364    1106    1482    2609    5259

summary(vacc_pt_merged$months_to_vax)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   83.71  254.34  340.82  599.97 1209.36

summary(vacc_pt_merged$years_to_vax)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    6.00   21.00   28.02   50.00  100.00

Extracting Date Components

We will perform the following extractions for the vaccination date:

Day: The day of the month is extracted using the day() function.
Month: The month of the date is obtained with the month() function. This provides a numeric value (1 for January, 12 for December).
Year: The year component is retrieved using the year() function, returning a four-digit value for the year.
Quarter: The quarter of the year is extracted using the quarter() function, providing a numeric value from 1 to 4, representing the first through fourth quarter of the year.
Season: Based on the month, we assign a corresponding season (Summer, Autumn, Winter, or Spring) using a conditional case_when() function:

These components are useful for segmenting the data, performing seasonal analysis, and understanding how different periods influence the patterns observed in the data. For instance, analyzing vaccination trends by quarter or season can provide valuable insights into public health strategies.

By extracting these components, we are preparing the data for more granular analysis, including cohort analysis, seasonal trend assessments, and other forms of time-based grouping.

# Derive calendar components of the vaccination date for grouping and plotting
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Basic date components
    vacc_day = day(vacc_date),
    vacc_month = month(vacc_date),                                   # 1-12
    vacc_month_name = month(vacc_date, label = TRUE, abbr = FALSE),  # Full month name (ordered factor)
    vacc_year = year(vacc_date),
    vacc_weekday = wday(vacc_date, label = TRUE, abbr = FALSE),      # Full weekday name (ordered factor)
    vacc_week = week(vacc_date),                                     # Week of year
    vacc_quarter = quarter(vacc_date),                               # 1-4

    # Fiscal year, assuming it starts in June: June-December belong to the
    # fiscal year labelled with the same calendar year, January-May to the
    # one labelled with the previous calendar year
    vacc_fy = if_else(vacc_month >= 6, vacc_year, vacc_year - 1),

    # Season (Northern Hemisphere)
    vacc_season = case_when(
      vacc_month %in% 3:5 ~ "Spring",
      vacc_month %in% 6:8 ~ "Summer",
      vacc_month %in% 9:11 ~ "Autumn",
      TRUE ~ "Winter"  # December-February
    ),

    # Weekend flag. week_start = 7 pins the numbering to 1 = Sunday and
    # 7 = Saturday, so the result does not depend on the session option
    # `lubridate.week.start`.
    is_weekend = wday(vacc_date, week_start = 7) %in% c(1, 7),

    # Whole days elapsed since the reference date 2020-01-01
    # (negative for earlier vaccinations)
    days_since_ref = as.numeric(vacc_date - as.Date("2020-01-01"))
  )

Filtering Between Date Ranges

Filtering data based on specific date ranges is a common operation in data analysis, particularly when focusing on subsets of data that fall within particular periods. In R, the dplyr package provides an intuitive way to filter rows within a specific date range.

Below is an example of how to filter rows based on dates that fall between two specified dates using the filter() function:

# Vaccinations given during calendar year 2022 (both end dates inclusive)
vacc_2022 <- vacc_pt_merged %>%
  filter(
    vacc_date >= ymd("2022-01-01"),
    vacc_date <= ymd("2022-12-31")
  )

# Vaccinations given in service years 2015 through 2024 (inclusive)
vacc_1524 <- vacc_pt_merged %>%
  filter(between(vacc_service_year, 2015, 2024))
         

table(vacc_1524$vacc_year)


2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 
 153  139  137  151  161  137  263  120  170  122

table(vacc_1524$race)


 asian  black native  other  white 
   111     94     13     29   1306

# table(vacc_1524$ethnicity)

# Move the identifying and derived date columns to the front, in this order,
# for easier visual checking; all remaining columns follow unchanged
vacc_1524 <- vacc_1524 %>%
  relocate(
    id, gender, race, birthdate, date, vacc_date,
    age_years, vacc_service_year,
    days_to_vax, weeks_to_vax, months_to_vax, years_to_vax,
    vacc_day, vacc_month, vacc_month_name, vacc_year,
    vacc_weekday, vacc_week, vacc_quarter, vacc_fy,
    vacc_season, is_weekend, days_since_ref
  )

Summary of vaccination pattern: Tabulation ———————————–

# Expand the single-letter gender codes; any other value is kept unchanged
vacc_1524 <- vacc_1524 %>%
  mutate(
    gender = if_else(
      gender == "F",
      "Female",
      if_else(gender == "M", "Male", gender)
    )
  )

# Store the three tabulation variables as character so their tables share
# one column type when they are stacked
vacc_1524 <- vacc_1524 %>%
  mutate(
    vacc_service_year = as.character(vacc_service_year),
    vacc_quarter = as.character(vacc_quarter),
    vacc_weekday = as.character(vacc_weekday)
  )

# Cross-tabulate one time variable against gender.
#
# data:     data frame containing `gender` and the time variable
# time_var: unquoted name of the time variable to tabulate
# category: label written to the `Category` column (e.g. "Year")
#
# Returns the gender-by-time counts with a "Total" row appended, a leading
# `Category` column, and the time variable renamed to `Label`, so tables
# built from different time variables can be stacked.
tabulate_by_gender <- function(data, time_var, category) {
  data %>%
    tabyl({{ time_var }}, gender) %>%
    adorn_totals("row") %>%
    mutate(Category = category, .before = 1) %>%
    rename(Label = {{ time_var }})
}

# Create and label each table
tab_year <- tabulate_by_gender(vacc_1524, vacc_service_year, "Year")
tab_quarter <- tabulate_by_gender(vacc_1524, vacc_quarter, "Quarter")
tab_weekday <- tabulate_by_gender(vacc_1524, vacc_weekday, "Weekday")

# Combine all
tab_combined <- bind_rows(tab_year, tab_quarter, tab_weekday)

# Percentage of each row within its block of rows. The "Total" row is left
# out of the denominator, so the Total row itself comes out as 100.
pct_within_block <- function(counts, labels) {
  round(100 * counts / sum(counts[labels != "Total"]), 1)
}

# Apply the percentage separately inside each Category (Year, Quarter, Weekday)
tab_combined <- tab_combined %>%
  group_by(Category) %>%
  mutate(
    Female_Prop = pct_within_block(Female, Label),
    Male_Prop = pct_within_block(Male, Label)
  ) %>%
  ungroup()

# Render the combined table with flextable: readable column headers,
# a caption line above the header, boxed borders, and auto-sized columns
ft <- tab_combined %>%
  flextable() %>%
  set_header_labels(
    values = list(
      Category = "Time Unit",
      Label = "Category",
      Female = "Female Count",
      Male = "Male Count",
      Female_Prop = "Female %",
      Male_Prop = "Male %"
    )
  ) %>%
  add_header_lines(values = "Table: Vaccination counts and gender proportions (%) across Year, Quarter, and Weekday") %>%
  theme_box() %>%
  autofit()

ft

Table: Vaccination counts and gender proportions (%) across Year, Quarter, and Weekday
Time Unit	Category	Female Count	Male Count	Female %	Male %
Year	2015	74	79	9.3	10.5
Year	2016	77	62	9.6	8.2
Year	2017	65	72	8.1	9.5
Year	2018	97	54	12.2	7.2
Year	2019	78	83	9.8	11.0
Year	2020	61	76	7.6	10.1
Year	2021	146	117	18.3	15.5
Year	2022	61	59	7.6	7.8
Year	2023	82	88	10.3	11.7
Year	2024	57	65	7.1	8.6
Year	Total	798	755	100.0	100.0
Quarter	1	174	229	21.8	30.3
Quarter	2	230	146	28.8	19.3
Quarter	3	205	262	25.7	34.7
Quarter	4	189	118	23.7	15.6
Quarter	Total	798	755	100.0	100.0
Weekday	Friday	69	111	8.6	14.7
Weekday	Monday	117	60	14.7	7.9
Weekday	Saturday	171	82	21.4	10.9
Weekday	Sunday	153	124	19.2	16.4
Weekday	Thursday	118	135	14.8	17.9
Weekday	Tuesday	79	86	9.9	11.4
Weekday	Wednesday	91	157	11.4	20.8
Weekday	Total	798	755	100.0	100.0

📝 Summary of Vaccination Uptake by Gender and Time Period

The table above presents vaccination counts and gender proportions across different time units—year, quarter, and weekday—for vaccinations given between 2015 and 2024 (the vacc_1524 subset).

By Year

2021 recorded the highest vaccination uptake for both females (146; 18.3%) and males (117; 15.5%), indicating a peak in vaccine distribution during that year.
The lowest uptake was observed in 2024 for females (57; 7.1%) and in 2018 for males (54; 7.2%).

Overall, vaccination was more evenly distributed across years, but there was a noticeable increase in 2021, likely due to public health initiatives during the COVID-19 pandemic.

By Quarter
For females, the highest uptake was in Quarter 2 (230; 28.8%), followed by Quarter 3 (205; 25.7%).
For males, Quarter 3 (262; 34.7%) had the highest uptake, while Quarter 2 (146; 19.3%) was lower in comparison.

Quarter 1 showed the lowest female uptake proportionally (21.8%), while Quarter 4 had the lowest for males (15.6%).

By Weekday

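Saturday (171; 21.4%) was the peak for females, while Wednesday (157; 20.8%) was the peak for males.
Thursday (17.9%) and Friday (14.7%) accounted for a larger share of male than of female vaccinations (14.8% and 8.6%, respectively).
Monday had the lowest male uptake (60; 7.9%), while Friday (69; 8.6%) was the lowest for females.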

📊 Visualizing Vaccination Trends

Time-based visualizations help us uncover trends, seasonality, and variation in vaccination data. Below are some examples of how to visualize date information using vacc_1524, the 2015–2024 subset of the vacc_pt_merged dataset. These plots support evidence-based insights and can guide public health decision-making.

Overall Vaccination Patterns by Gender and Race

2021 was a notable year across most racial groups, with peaks in vaccination uptake, likely due to intensified COVID-19 vaccine rollouts.
The “Other” racial group showed the most fluctuation with a sharp spike in 2023.
The White group had the most consistent trend, while Asian and Black groups showed spikes at specific periods.
Females showed a sharper peak than males in 2021 but also had a steeper decline afterward.
Males had more consistent trends overall, especially from 2015–2020 and 2022–2024.
The post-2021 decline in both groups may suggest reduced urgency or campaign intensity in recent years
Post-2021, a general decline across all groups suggests possible vaccine fatigue or the end of mass campaign periods.
Variations of vaccination uptake by gender and race are evident, suggesting differing levels of access, trust, or outreach effectiveness across communities.

Monthly Vaccination Trend in each Year

Use a line plot to visualize how the number of vaccinations changes over time.

The time series reveals longitudinal patterns, showing potential campaign effects (peaks) or service interruptions (troughs). The upward trend in mid-2021 suggests a successful vaccination program.

As shown in the monthly vaccination pattern,

A bi-modal pattern is evident with two peaks (March and August) and two noticeable dips (June and November).
March and August are peak months — possibly due to public health campaigns.
November was the lowest — possibly due to holiday distractions, reduced outreach, or end-of-year slowdowns.

# Line plot of vaccination counts per calendar month across the study years
vacc_1524 %>%
  count(vacc_year, vacc_month) %>%
  # Place each (year, month) count on the first day of that month
  mutate(month_start = make_date(vacc_year, vacc_month, 1)) %>%
  ggplot(aes(x = month_start, y = n)) +
  geom_line(color = "#1f78b4", linewidth = 1) +
  geom_point(color = "#33a02c") +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %Y") +
  labs(
    title = "Monthly Vaccination Trends",
    x = "Timeline",
    y = "Vaccinations"
  ) +
  theme_minimal() +
  # Tilt the dense quarterly tick labels so they stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total vaccinations per calendar month, pooled over all years
vacc_1524 %>%
  # Abbreviated month label (ordered factor) taken from the vaccination date
  count(month = month(vacc_date, label = TRUE, abbr = TRUE), name = "count") %>%
  # group = 1 joins all months into a single line despite the discrete x-axis
  ggplot(aes(x = month, y = count, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "steelblue", size = 2) +
  labs(
    title = "Total Vaccinations by Month",
    x = "Month",
    y = "Vaccination Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Weekday Heatmap

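The heatmap identifies temporal service patterns: darker tiles mark weekday–month combinations with more vaccinations. Concentrations on particular weekdays may reflect clinic scheduling norms, while sparsely filled days could indicate access barriers. Note that in these data the weekday totals in the table above show Saturday and Sunday among the busiest days, so weekends are not low-volume here.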

# Heatmap of vaccination counts by weekday (x) and month (y)
vacc_1524 %>%
  # Recompute the weekday as an ordered factor from vacc_date. The stored
  # vacc_weekday column was converted to character for the tabulations,
  # which would sort the x-axis alphabetically instead of in week order.
  mutate(vacc_weekday = wday(vacc_date, label = TRUE, abbr = FALSE)) %>%
  count(vacc_month_name, vacc_weekday) %>%
  ggplot(aes(x = vacc_weekday, y = vacc_month_name, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#f7fbff", high = "#08306b") +
  labs(title = "Vaccination Intensity by Weekday and Month",
       x = "Weekday", y = "Month") +
  theme_minimal() +
  theme(legend.position = "bottom")

Heatmap of vaccinations by weekday and month

Seasonal Vaccination Distribution

Explore how vaccination services vary by season.

# Bar chart of vaccination counts per meteorological season
vacc_1524 %>%
  mutate(
    month = month(vacc_date),
    # Northern Hemisphere seasons: the patients are located in Massachusetts,
    # and this matches the vacc_season definition used earlier in the file
    # (December-February = Winter).
    season = case_when(
      month %in% c(12, 1, 2) ~ "Winter",
      month %in% c(3, 4, 5) ~ "Spring",
      month %in% c(6, 7, 8) ~ "Summer",
      month %in% c(9, 10, 11) ~ "Autumn"
    ),
    # Fix the level order so the bars follow the course of the year rather
    # than alphabetical order
    season = factor(season, levels = c("Winter", "Spring", "Summer", "Autumn"))
  ) %>%
  count(season) %>%
  ggplot(aes(x = season, y = n, fill = season)) +
  geom_col() +
  labs(
    title = "Seasonal Vaccination Distribution",
    x = "Season",
    y = "Vaccination Count"
  ) +
  theme_minimal()

Vaccination proportion by year and race

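The plots below show, for each race group, the share of that group's vaccinations that fell in each year—first as a line plot and then as a stacked bar plot.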

# Yearly share of each race group's vaccinations.
# "native" and "other" are merged into a single "Other" group and the labels
# are title-cased; `proportion` is the percentage of a race group's own total
# that falls in each year (so it sums to 100 within each race).
df_plot_race <- vacc_1524 %>%
  mutate(
    race = str_to_title(if_else(race %in% c("native", "other"), "Other", race)),
    vacc_year = as.integer(vacc_year)  # whole-number year for the x-axis
  ) %>%
  count(race, vacc_year, name = "count") %>%
  group_by(race) %>%
  mutate(
    total = sum(count),
    proportion = (count / total) * 100
  ) %>%
  ungroup()

# Line plot: one line per race group, x-axis ticks every second year
df_plot_race %>%
  ggplot(aes(x = vacc_year, y = proportion, color = race)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_continuous(
    breaks = with(df_plot_race, seq(min(vacc_year), max(vacc_year), by = 2))
  ) +
  labs(
    title = "Vaccination Proportion by Race and Year",
    x = "Vaccination Year",
    y = "Proportion (%)",
    color = "Race"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right"
  )

# Bar plot: the same within-race percentages, stacked per year.
# NOTE(review): `proportion` is normalised within each race group, so the
# height of a year's stack is a sum of per-race shares and does not total
# 100% - confirm this is the intended reading of the chart.
df_plot_race %>%
  ggplot(aes(x = vacc_year, y = proportion, fill = race)) +
  geom_col() +  # geom_col() stacks the supplied y values by default
  scale_x_continuous(
    breaks = with(df_plot_race, seq(min(vacc_year), max(vacc_year), by = 2))
  ) +
  labs(
    title = "Vaccination Proportion by Race and Year",
    x = "Vaccination Year",
    y = "Proportion (%)",
    fill = "Race"
  ) +
  theme_minimal()

# Yearly share of each gender's vaccinations, drawn as one line per gender.
# `proportion` is the percentage of a gender's own total that falls in each
# year (so it sums to 100 within each gender).
vacc_1524 %>%
  mutate(
    # Expand single-letter codes; values that are already spelled out pass through
    gender = if_else(
      gender == "M",
      "Male",
      if_else(gender == "F", "Female", gender)
    ),
    vacc_year = as.integer(vacc_year)  # whole-number year for the x-axis
  ) %>%
  count(gender, vacc_year, name = "count") %>%
  group_by(gender) %>%
  mutate(
    total = sum(count),
    proportion = (count / total) * 100
  ) %>%
  ungroup() %>%
  ggplot(aes(x = vacc_year, y = proportion, color = gender)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = scales::pretty_breaks()) +  # evenly spaced, rounded year ticks
  scale_color_manual(values = c("Male" = "#1f77b4", "Female" = "#ff7f0e")) +
  labs(
    title = "Vaccination Proportion by Gender and Year",
    x = "Vaccination Year",
    y = "Proportion (%)",
    color = "Gender"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.position = "right"
  )

Vaccination proportion by month and year

Summary and Key Takeaways

Key techniques covered:

Proper date conversion from character strings
Accurate age and time difference calculations
Seasonal and temporal pattern analysis
Visualization of date-based trends
Survival analysis for time-to-event data (a natural next step that builds on these techniques; not demonstrated in this presentation)

Best practices:

Always validate date conversions
Document time units clearly (days vs. months vs. years)
Handle missing dates explicitly
Consider time zones for multi-center studies

References

https://CRAN.R-project.org/package=dplyr
https://www.jstatsoft.org/article/view/v103i01
https://r4ds.hadley.nz/