Overview of Presentation Contents
This presentation introduces essential techniques for analyzing date and time data in health research using R. Key topics include:
Importance of date/time analysis
Core R packages for handling dates (e.g., lubridate, tidyverse)
Date conversion and formatting
Calculating durations and age
Extracting date components (e.g., month, weekday, season)
Summarizing and visualizing time-based trends
1. Importance of Date/Time Analysis in Health Data
Date-time data is crucial in healthcare because it allows us to:
Calculate accurate patient ages and service times
Measure time intervals between critical events (e.g., diagnosis to treatment, age at vaccination)
Identify seasonal patterns in disease occurrence
Track vaccine effectiveness over time
Determine follow-up periods for clinical studies
Load required libraries
library(lubridate) # Simplifies the manipulation of dates and times in R (e.g., formatting, extracting components)
library(dplyr) # Provides tools for data manipulation, helpful for filtering and summarizing date-based data
library(stringr) # Simplifies string manipulation
library(zoo) # For working with time series data, including rolling calculations and handling missing data in time series
library(tsibble) # Provides tools for handling time series data, including date-time indexes and features for forecasting
library(readr) # For reading date-time data from CSV
library(haven) # data from other statistical software formats (SPSS, SAS, Stata)
library(forecast) # Useful for time series forecasting, especially when working with seasonal or trend-based data
library(ggplot2) # For visualizing date-time trends in data (e.g., time series plots)
library(janitor) # Tabulation, cleaning column names, adding totals and proportions
library (plotly) # Interactive visualizations
# Documentation/reporting
library(prettydoc) # Pretty document templates
library(flexdashboard)# Interactive dashboards
library(knitr) # Dynamic report generation
library(quarto) # For rendering and publishing documents with the Quarto framework
library(rmarkdown) # R Markdown document processing
library(yaml) # YAML document processing
# Tabulation
library(flextable) # Flexible table formatting
library(gt) # Grammar of tables
library(reactable) # Interactive tables

# Load the synthetic patient and immunization extracts.
# The data folder is kept in a variable and combined with file.path() rather
# than passed to setwd(), so running this chunk does not change the session's
# working directory as a side effect. Edit data_dir to point at your own copy.
data_dir <- "C:/Users/User/Desktop/Materials_ Course and proposals/Course Related/DataCamp/Data/Synthetic_data"
patients <- read.csv(file.path(data_dir, "patients.csv"))
vaccination <- read.csv(file.path(data_dir, "immunizations.csv"))

# Convert column names to lowercase so both tables can be referenced consistently
colnames(patients) <- tolower(colnames(patients))
colnames(vaccination) <- tolower(colnames(vaccination))
# View data
# head(patients)
# head(vaccination)
str(patients)
'data.frame': 106 obs. of 28 variables:
$ id : chr "30a6452c-4297-a1ac-977a-6a23237c7b46" "34a4dcc4-35fb-6ad5-ab98-be285c586a4f" "7179458e-d6e3-c723-2530-d4acfe1c2668" "37c177ea-4398-fb7a-29fa-70eb3d673876" ...
$ birthdate : chr "1994-02-06" "1968-08-06" "2008-12-21" "1994-01-27" ...
$ deathdate : chr "" "2009-12-11" "" "" ...
$ ssn : chr "999-52-8591" "999-75-3953" "999-70-1925" "999-27-9779" ...
$ drivers : chr "S99996852" "S99993577" "" "S99995100" ...
$ passport : chr "X47758697X" "X28173268X" "" "X83694889X" ...
$ prefix : chr "Mr." "Mr." "" "Mrs." ...
$ first : chr "Joshua658" "Bennie663" "Hunter736" "Carlyn477" ...
$ middle : chr "Alvin56" "" "Mckinley734" "Florencia449" ...
$ last : chr "Kunde533" "Ebert178" "Gerlach374" "Williamson769" ...
$ suffix : logi NA NA NA NA NA NA ...
$ maiden : chr "" "" "" "Rogahn59" ...
$ marital : chr "M" "D" "" "M" ...
$ race : chr "white" "white" "white" "asian" ...
$ ethnicity : chr "nonhispanic" "nonhispanic" "nonhispanic" "nonhispanic" ...
$ gender : chr "M" "M" "M" "F" ...
$ birthplace : chr "Boston Massachusetts US" "Chicopee Massachusetts US" "Spencer Massachusetts US" "Franklin Massachusetts US" ...
$ address : chr "811 Kihn Viaduct" "975 Pfannerstill Throughway" "548 Heller Lane" "160 Fadel Crossroad Apt 65" ...
$ city : chr "Braintree" "Braintree" "Mattapoisett" "Wareham" ...
$ state : chr "Massachusetts" "Massachusetts" "Massachusetts" "Massachusetts" ...
$ county : chr "Norfolk County" "Norfolk County" "Plymouth County" "Plymouth County" ...
$ fips : int 25021 25021 NA NA NA 25017 25017 25021 25009 NA ...
$ zip : int 2184 2184 0 0 0 2138 2476 2184 1835 0 ...
$ lat : num 42.2 42.3 41.6 41.8 42.7 ...
$ lon : num -71 -71 -70.9 -70.7 -71 ...
$ healthcare_expenses: num 56905 124024 45645 12895 18500 ...
$ healthcare_coverage: num 18020 1075 6155 659952 5494 ...
$ income : int 100511 49737 133816 17382 52159 75767 58294 49737 77756 35255 ...
# Inspect the vaccination table: column names, types, and the first few values
str(vaccination)
'data.frame': 1619 obs. of 6 variables:
$ date : chr "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" "2016-04-10T09:04:48Z" ...
$ patient : chr "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" "30a6452c-4297-a1ac-977a-6a23237c7b46" ...
$ encounter : chr "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" "0b03e41b-06a6-66fa-b972-acc5a83b134a" ...
$ code : int 140 113 43 114 140 140 140 140 140 52 ...
$ description: chr "Influenza seasonal injectable preservative free" "Td (adult) 5 Lf tetanus toxoid preservative free adsorbed" "Hep B adult" "meningococcal MCV4P" ...
$ base_cost : num 136 136 136 136 136 136 136 136 136 136 ...
# Check for missing IDs: treat blank strings as missing, then count the NAs
patients$id <- replace(patients$id, patients$id == "", NA)
n_missing_id <- sum(is.na(patients$id))
print(n_missing_id)
[1] 0
# Check duplicates: number the rows within each id; any value above 1 marks a repeat
patients <- patients %>%
  arrange(id) %>%
  group_by(id) %>%
  mutate(dup = seq_len(n())) %>%
  ungroup()
table(patients$dup) # Check if any duplicates exist
1
106
# Merge with vaccination data
# Copy the `patient` column into an `id` column so both tables share a key name
vaccination$id <- vaccination$patient
# Keep every patient row and attach that patient's vaccination records
vacc_pt_merged <- patients %>%
  left_join(vaccination, by = "id")
#vacc_pt_merged <- left_join(patients, vaccination, by = c("id" = "patient"))
str(vacc_pt_merged)
Converting Character to Date Format
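In the dataset, birthdate, deathdate, and date (the vaccination timestamp) are all stored as character strings. They need to be converted to proper Date objects before analysis, for example with lubridate::ymd() for plain dates; the vaccination timestamp also carries a time of day, which is dropped when it is reduced to a date.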
# Convert to date format
# First inspect the raw columns: both are still character vectors at this point
str (vacc_pt_merged$birthdate)
 chr [1:1619] "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" ...
str (vacc_pt_merged$date) # date is an ISO 8601 timestamp: YYYY-MM-DDTHH:MM:SSZ
 chr [1:1619] "2015-06-24T14:05:28Z" "2016-06-29T14:05:28Z" ...
# Convert the character date columns of the merged table to Date objects.
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # birthdate is plain "YYYY-MM-DD" text
    birthdate = ymd(birthdate),
    # deathdate has the same layout but is "" for patients with no recorded
    # death; blank strings become NA first so they parse to NA dates
    deathdate = ymd(na_if(deathdate, "")),
    # date is an ISO 8601 UTC timestamp (e.g. "2016-04-10T09:04:48Z");
    # parse it explicitly as UTC, then keep only the calendar date
    vacc_date = as_date(ymd_hms(date, tz = "UTC"))
  )
# Check structure: both columns should now report class Date
str (vacc_pt_merged$birthdate)
 Date[1:1619], format: "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" "1975-12-24" ...
str (vacc_pt_merged$vacc_date)
 Date[1:1619], format: "2015-06-24" "2016-06-29" "2016-06-29" "2017-07-05" "2017-07-05" ...
# Check converted formats via glimpse
# glimpse(vacc_pt_merged)
# no missing values on dates
# (a value that fails to parse becomes NA, so a count of 0 means every row converted)
sum(is.na(vacc_pt_merged$birthdate))
[1] 0
sum(is.na(vacc_pt_merged$vacc_date))
[1] 0
# Check date ranges: look for implausible minimum or maximum dates
summary(vacc_pt_merged$birthdate)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
"1914-03-03" "1969-12-09" "1999-02-24" "1990-12-03" "2012-09-02" "2023-03-01"
summary(vacc_pt_merged$vacc_date)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
"1962-12-09" "2017-05-21" "2020-01-04" "2019-04-29" "2022-01-26" "2024-10-26"
Now date columns are in Date format (YYYY-MM-DD), ready for analysis!
Age and Service year Calculation
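Age is essential in most health analyses. Here I calculate each patient's current age in whole years (measured from birthdate to the date the script is run, via Sys.Date()) together with the calendar year in which each vaccination was given. Age at the time of vaccination is derived in the next section (years_to_vax).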
# Add age (in whole years) and the calendar year of each vaccination.
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Age as of the run date (Sys.Date()), NOT age at vaccination:
    # days elapsed since birth divided by 365.25, rounded down.
    # Age at vaccination is available separately as years_to_vax.
    age_years = floor(as.numeric(Sys.Date() - birthdate) / 365.25),
    # Calculate vaccination service year by extracting the year from vacc_date
    vacc_service_year = year(vacc_date)
  )
# Five-number summary plus mean of age_years (age as of the run date, one value per vaccination record)
summary(vacc_pt_merged$age_years)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 13.0 26.0 34.4 55.0 111.0
# Same summary for the calendar year of vaccination
summary(vacc_pt_merged$vacc_service_year)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
1962 2017 2020 2019 2022 2024
# tabulation
# Number of vaccination records at each age value (first row: age, second row: count)
table(vacc_pt_merged$age_years)
2 3 6 7 8 10 11 12 13 14 15 16 17 18 19 20 21 22 24 25
46 24 64 66 34 72 30 38 59 21 21 51 66 18 36 18 69 13 13 12
26 28 29 30 31 34 37 38 39 41 42 44 45 46 47 48 49 50 52 53
47 17 30 9 38 11 6 8 6 27 46 9 17 15 13 18 42 15 13 30
54 55 56 57 59 61 62 63 65 66 69 70 72 73 75 77 79 81 86 98
16 15 15 30 14 26 13 62 26 14 14 57 15 27 15 14 12 13 12 20
111
1
# Number of vaccination records in each calendar year (first row: year, second row: count)
table(vacc_pt_merged$vacc_service_year)
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 2001 2002 2003 2004 2005 2006
1 3 1 1 1 2 1 1 1 1 1 2 1 1 2 1
2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
3 6 2 3 2 4 3 22 153 139 137 151 161 137 263 120
2023 2024
170 122
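As shown above, the mean current age across vaccination records (as of the analysis date) was approximately 34.4 years, with a range from 2 to 111 years. Note that age_years is measured from birth to the date the script was run, not to the vaccination date; age at vaccination is summarised by years_to_vax in the next section.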
The highest frequency of vaccinations occurred in the year 2021, suggesting a peak in service delivery during that period.
Calculating Time Differences
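To calculate time differences, I use difftime() or simple subtraction between Date objects (lubridate's interval() is an alternative when exact calendar months or years are needed).
Example: the difference between birthdate and vacc_date gives each patient's age at vaccination, expressed below in days, weeks, months, and years.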
# Time from birth to vaccination, expressed in four different units.
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Exact number of days between the two dates
    days_to_vax = as.numeric(difftime(vacc_date, birthdate, units = "days")),
    # Same gap expressed in weeks
    weeks_to_vax = as.numeric(difftime(vacc_date, birthdate, units = "weeks")),
    # Approximate months: days divided by the average month length (30.44 days)
    months_to_vax = days_to_vax / 30.44,
    # Approximate completed years: days divided by 365.25 (allows for leap years), rounded down
    years_to_vax = floor(days_to_vax / 365.25)
  )
# View summary statistics
# Birth-to-vaccination gap in days
summary(vacc_pt_merged$days_to_vax)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
0 2548 7742 10375 18263 36813
# ... in weeks
summary(vacc_pt_merged$weeks_to_vax)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
0 364 1106 1482 2609 5259
# ... in approximate months
summary(vacc_pt_merged$months_to_vax)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 83.71 254.34 340.82 599.97 1209.36
# ... in approximate completed years (i.e. age at vaccination)
summary(vacc_pt_merged$years_to_vax)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 6.00 21.00 28.02 50.00 100.00
Extracting Date Components
We will perform the following extractions for the vaccination date:
Day: The day of the month is extracted using the day() function.
Month: The month of the date is obtained with the month() function. This provides a numeric value (1 for January, 12 for December).
Year: The year component is retrieved using the year() function, returning a four-digit value for the year.
Quarter: The quarter of the year is extracted using the quarter() function, providing a numeric value from 1 to 4, representing the first through fourth quarter of the year.
Season: Based on the month, we assign a corresponding season (Spring, Summer, Autumn, or Winter) using a conditional case_when() function.
These components are useful for segmenting the data, performing seasonal analysis, and understanding how different periods influence the patterns observed in the data. For instance, analyzing vaccination trends by quarter or season can provide valuable insights into public health strategies.
By extracting these components, we are preparing the data for more granular analysis, including cohort analysis, seasonal trend assessments, and other forms of time-based grouping.
# Break vacc_date into the calendar components used for grouping later on.
vacc_pt_merged <- vacc_pt_merged %>%
  mutate(
    # Basic date components
    vacc_day = mday(vacc_date), # Day of the month
    vacc_month = month(vacc_date), # Month number, 1-12
    vacc_month_name = month(vacc_date, label = TRUE, abbr = FALSE), # Full month name
    vacc_year = year(vacc_date),
    vacc_weekday = wday(vacc_date, label = TRUE, abbr = FALSE), # Full weekday name
    vacc_week = week(vacc_date), # Week of year
    vacc_quarter = quarter(vacc_date),
    # Fiscal year (assuming June start): January-May count towards the previous year
    vacc_fy = if_else(vacc_month < 6, vacc_year - 1, vacc_year),
    # Season (Northern Hemisphere)
    vacc_season = case_when(
      vacc_month %in% c(3, 4, 5) ~ "Spring",
      vacc_month %in% c(6, 7, 8) ~ "Summer",
      vacc_month %in% c(9, 10, 11) ~ "Autumn",
      TRUE ~ "Winter" # December-February
    ),
    # Weekend flag
    is_weekend = wday(vacc_date) %in% c(1, 7), # 1=Sunday, 7=Saturday
    # Days since specific reference date (e.g., pandemic start)
    days_since_ref = floor(
      as.numeric(difftime(vacc_date, as.Date("2020-01-01"), units = "days"))
    )
  )
Filtering Between Date Ranges
Filtering data based on specific date ranges is a common operation in data analysis, particularly when focusing on subsets of data that fall within particular periods. In R, the dplyr package provides an intuitive way to filter rows within a specific date range.
Below is an example of how to filter rows based on dates that fall between two specified dates using the filter() function:
# Filter only vaccinations in 2022 (both boundary dates included)
vacc_2022 <- filter(
  vacc_pt_merged,
  vacc_date >= ymd("2022-01-01"),
  vacc_date <= ymd("2022-12-31")
)
# Filter vaccinations between 2015 and 2024 (both years included)
vacc_1524 <- filter(
  vacc_pt_merged,
  vacc_service_year >= 2015,
  vacc_service_year <= 2024
)
# Confirm the subset covers only 2015-2024 and see how many records fall in each year
table(vacc_1524$vacc_year)
2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
153 139 137 151 161 137 263 120 170 122
# Number of vaccination records by race within the 2015-2024 subset
table(vacc_1524$race)
asian black native other white
111 94 13 29 1306
# table(vacc_1524$ethnicity)
# ordering variables for visual checking
# Move the identifier, demographic and derived date columns to the front;
# all remaining columns keep their existing order after them.
vacc_1524 <- vacc_1524 %>%
  relocate(
    id, gender, race, birthdate, date, vacc_date, age_years,
    vacc_service_year, days_to_vax, weeks_to_vax, months_to_vax, years_to_vax,
    vacc_day, vacc_month, vacc_month_name, vacc_year, vacc_weekday, vacc_week,
    vacc_quarter, vacc_fy, vacc_season, is_weekend, days_since_ref
  )
Summary of vaccination pattern: Tabulation ———————————–
# Recode gender and convert the time variables to character in one step
vacc_1524 <- vacc_1524 %>%
  mutate(
    # "F" -> "Female", "M" -> "Male"; any other value is left as it is
    gender = if_else(
      gender == "F",
      "Female",
      if_else(gender == "M", "Male", gender)
    ),
    # Convert time variables to character for consistency
    across(c(vacc_service_year, vacc_quarter, vacc_weekday), as.character)
  )
# Create and label each table
# Each table cross-tabulates one time unit against gender, appends a "Total"
# row, renames the time column to a shared `Label`, and tags the rows with
# the time unit they describe.
tab_year <- vacc_1524 %>%
  tabyl(vacc_service_year, gender) %>%
  adorn_totals("row") %>%
  rename(Label = vacc_service_year) %>%
  mutate(Category = "Year", .before = 1)
tab_quarter <- vacc_1524 %>%
  tabyl(vacc_quarter, gender) %>%
  adorn_totals("row") %>%
  rename(Label = vacc_quarter) %>%
  mutate(Category = "Quarter", .before = 1)
tab_weekday <- vacc_1524 %>%
  tabyl(vacc_weekday, gender) %>%
  adorn_totals("row") %>%
  rename(Label = vacc_weekday) %>%
  mutate(Category = "Weekday", .before = 1)
# Combine all
tab_combined <- list(tab_year, tab_quarter, tab_weekday) %>%
  bind_rows()
# Calculate column proportions within each Category group (excluding "Total" row)
# The denominator leaves out the "Total" row, so that row itself comes out as 100%.
tab_combined <- tab_combined %>%
  group_by(Category) %>%
  mutate(
    across(
      c(Female, Male),
      ~ round(100 * .x / sum(.x[Label != "Total"]), 1),
      .names = "{.col}_Prop"
    )
  ) %>%
  ungroup()
# Format with flextable
# Give the columns reader-friendly headers, add a caption line above them,
# draw cell borders, and size the columns to fit their content.
ft <- tab_combined %>%
  flextable() %>%
  set_header_labels(
    values = list(
      Category = "Time Unit",
      Label = "Category",
      Female = "Female Count",
      Male = "Male Count",
      Female_Prop = "Female %",
      Male_Prop = "Male %"
    )
  ) %>%
  add_header_lines(
    values = "Table: Vaccination counts and gender proportions (%) across Year, Quarter, and Weekday"
  ) %>%
  theme_box() %>%
  autofit()
ft
Table: Vaccination counts and gender proportions (%) across Year, Quarter, and Weekday | |||||
|---|---|---|---|---|---|
Time Unit | Category | Female Count | Male Count | Female % | Male % |
Year | 2015 | 74 | 79 | 9.3 | 10.5 |
Year | 2016 | 77 | 62 | 9.6 | 8.2 |
Year | 2017 | 65 | 72 | 8.1 | 9.5 |
Year | 2018 | 97 | 54 | 12.2 | 7.2 |
Year | 2019 | 78 | 83 | 9.8 | 11.0 |
Year | 2020 | 61 | 76 | 7.6 | 10.1 |
Year | 2021 | 146 | 117 | 18.3 | 15.5 |
Year | 2022 | 61 | 59 | 7.6 | 7.8 |
Year | 2023 | 82 | 88 | 10.3 | 11.7 |
Year | 2024 | 57 | 65 | 7.1 | 8.6 |
Year | Total | 798 | 755 | 100.0 | 100.0 |
Quarter | 1 | 174 | 229 | 21.8 | 30.3 |
Quarter | 2 | 230 | 146 | 28.8 | 19.3 |
Quarter | 3 | 205 | 262 | 25.7 | 34.7 |
Quarter | 4 | 189 | 118 | 23.7 | 15.6 |
Quarter | Total | 798 | 755 | 100.0 | 100.0 |
Weekday | Friday | 69 | 111 | 8.6 | 14.7 |
Weekday | Monday | 117 | 60 | 14.7 | 7.9 |
Weekday | Saturday | 171 | 82 | 21.4 | 10.9 |
Weekday | Sunday | 153 | 124 | 19.2 | 16.4 |
Weekday | Thursday | 118 | 135 | 14.8 | 17.9 |
Weekday | Tuesday | 79 | 86 | 9.9 | 11.4 |
Weekday | Wednesday | 91 | 157 | 11.4 | 20.8 |
Weekday | Total | 798 | 755 | 100.0 | 100.0 |
📝 Summary of Vaccination Uptake by Gender and Time Period
The table above presents vaccination counts and gender proportions across different time units—year, quarter, and weekday—for vaccinations given between 2015 and 2024.
By Year
2021 recorded the highest vaccination uptake for both females (146; 18.3%) and males (117; 15.5%), indicating a peak in vaccine distribution during that year.
The lowest uptake was observed in 2024 for females (57; 7.1%) and in 2018 for males (54; 7.2%).
Overall, vaccination was more evenly distributed across years, but there was a noticeable increase in 2021, likely due to public health initiatives during the COVID-19 pandemic.
By Quarter
For females, the highest uptake was in Quarter 2 (230; 28.8%), followed by Quarter 3 (205; 25.7%).
For males, Quarter 3 (262; 34.7%) had the highest uptake, while Quarter 2 (146; 19.3%) was lower in comparison.
Quarter 1 showed the lowest female uptake proportionally (21.8%), while Quarter 4 had the lowest for males (15.6%).
By Weekday
Wednesday (157; 20.8%) was the peak for males.
Friday (14.7%) and Thursday (17.9%) were more popular among males.
Monday had the lowest male uptake (7.9%), while Friday (8.6%) was the lowest for females.
📊 Visualizing Vaccination Trends
Time-based visualizations help us uncover trends, seasonality, and variation in vaccination data. Below are some examples of how to visualize date information using the vacc_1524 dataset (the 2015–2024 subset of vacc_pt_merged). These plots support evidence-based insights and can guide public health decision-making.
Overall Vaccination Patterns by Gender and Race
2021 was a notable year across most racial groups, with peaks in vaccination uptake, likely due to intensified COVID-19 vaccine rollouts.
The “Other” racial group showed the most fluctuation with a sharp spike in 2023.
The White group had the most consistent trend, while Asian and Black groups showed spikes at specific periods.
Females showed a sharper peak than males in 2021 but also had a steeper decline afterward.
Males had more consistent trends overall, especially from 2015–2020 and 2022–2024.
The post-2021 decline in both groups may suggest reduced urgency or campaign intensity in recent years
Post-2021, a general decline across all groups suggests possible vaccine fatigue or the end of mass campaign periods.
Variations of vaccination uptake by gender and race are evident, suggesting differing levels of access, trust, or outreach effectiveness across communities.
Monthly Vaccination Trend in each Year
Use a line plot to visualize how the number of vaccinations changes over time.
The time series reveals longitudinal patterns, showing potential campaign effects (peaks) or service interruptions (troughs). The upward trend in mid-2021 suggests a successful vaccination program during that period.
As shown in the monthly vaccination pattern,
A bi-modal pattern is evident with two peaks (March and August) and two noticeable dips (June and November).
March and August are peak months — possibly due to public health campaigns.
November was the lowest — possibly due to holiday distractions, reduced outreach, or end-of-year slowdowns.
# Plot 1: vaccinations per calendar month across the whole 2015-2024 timeline
vacc_1524 %>%
  count(vacc_year, vacc_month) %>%
  # Represent each year-month pair by the first day of that month so it can
  # be placed on a continuous date axis
  mutate(
    month_start = as.Date(paste(vacc_year, vacc_month, "01", sep = "-"))
  ) %>%
  ggplot(aes(x = month_start, y = n)) +
  geom_line(color = "#1f78b4", linewidth = 1) +
  geom_point(color = "#33a02c") +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(
    title = "Monthly Vaccination Trends",
    x = "Timeline",
    y = "Vaccinations"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot 2: vaccinations per month of the year, pooled over all years
vacc_1524 %>%
  mutate(month = month(vacc_date, label = TRUE, abbr = TRUE)) %>% # Extract month from date
  count(month, name = "count") %>%
  # group = 1 joins the twelve months with a single line
  ggplot(aes(x = month, y = count, group = 1)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 2, color = "steelblue") +
  labs(
    title = "Total Vaccinations by Month",
    x = "Month",
    y = "Vaccination Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Weekday Heatmap
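The heatmap identifies temporal service patterns by weekday and month. In the tabulation above, Sunday (277), Saturday (253), Thursday (253), and Wednesday (248) have the highest overall counts, so this synthetic dataset shows no weekend gap. In real service data, higher mid-week intensity may reflect clinic scheduling norms, while weekend gaps could indicate access barriers.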
# Heatmap of vaccination counts for every weekday x month combination.
vacc_1524 %>%
  # vacc_weekday was converted to character for the tabulation step, which
  # makes ggplot sort the axis alphabetically (Friday, Monday, ...).
  # Rebuild it from vacc_date as an ordered factor so the axis runs in
  # calendar order.
  mutate(vacc_weekday = wday(vacc_date, label = TRUE, abbr = FALSE)) %>%
  count(vacc_month_name, vacc_weekday) %>%
  ggplot(aes(x = vacc_weekday, y = vacc_month_name, fill = n)) +
  geom_tile(color = "white") + # White borders separate neighbouring cells
  scale_fill_gradient(low = "#f7fbff", high = "#08306b") + # Darker = more vaccinations
  labs(
    title = "Vaccination Intensity by Weekday and Month",
    x = "Weekday", y = "Month"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
Seasonal Vaccination Distribution
Explore how vaccination services vary by season.
# Bar chart of vaccination counts per season.
vacc_1524 %>%
  # Reuse the Northern Hemisphere vacc_season column derived earlier
  # (Dec-Feb = Winter, Mar-May = Spring, Jun-Aug = Summer, Sep-Nov = Autumn).
  # A Southern Hemisphere mapping here would contradict that column and the
  # Massachusetts setting of the data.
  # The factor levels put the bars in calendar order instead of alphabetical.
  mutate(
    season = factor(
      vacc_season,
      levels = c("Winter", "Spring", "Summer", "Autumn")
    )
  ) %>%
  count(season) %>%
  ggplot(aes(x = season, y = n, fill = season)) +
  geom_col() +
  labs(
    title = "Seasonal Vaccination Distribution",
    x = "Season",
    y = "Vaccination Count"
  ) +
  theme_minimal()
Vaccination proportion by year and race
If we’d like a stacked bar plot of the proportions
# Proportion by vacc_year and race with cleaned race labels
# `proportion` is each year's share of that race's total over all years,
# so the values sum to 100 within a race, not within a year.
df_plot_race <- vacc_1524 %>%
  mutate(
    # Fold "native" and "other" into a single "Other" group, then title-case every label
    race = str_to_title(if_else(race %in% c("native", "other"), "Other", race)),
    vacc_year = as.integer(floor(as.numeric(vacc_year))) # Ensure whole number
  ) %>%
  count(race, vacc_year, name = "count") %>%
  group_by(race) %>%
  mutate(
    total = sum(count),
    proportion = (count / total) * 100
  ) %>%
  ungroup()
# Line plot
# One line per race showing how that race's vaccinations are spread over the years
df_plot_race %>%
  ggplot(aes(x = vacc_year, y = proportion, color = race)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  # Label every second year between the first and last year in the data
  scale_x_continuous(
    breaks = seq(
      from = min(df_plot_race$vacc_year),
      to = max(df_plot_race$vacc_year),
      by = 2
    )
  ) +
  labs(
    title = "Vaccination Proportion by Race and Year",
    x = "Vaccination Year",
    y = "Proportion (%)",
    color = "Race"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right"
  )
# Bar plot
# Stacked bar chart: racial composition of each year's vaccinations.
df_plot_race %>%
  # `proportion` in df_plot_race is each race's share of its OWN total across
  # years, so stacking it gives bars with no common denominator.
  # For a stacked chart, recompute each race's share of that year's
  # vaccinations so every bar sums to 100%.
  group_by(vacc_year) %>%
  mutate(year_share = 100 * count / sum(count)) %>%
  ungroup() %>%
  ggplot(aes(x = vacc_year, y = year_share, fill = race)) +
  geom_col(position = "stack") +
  # Label every second year between the first and last year in the data
  scale_x_continuous(breaks = seq(min(df_plot_race$vacc_year), max(df_plot_race$vacc_year), by = 2)) +
  labs(
    title = "Vaccination Proportion by Race and Year",
    x = "Vaccination Year",
    y = "Share of the year's vaccinations (%)",
    fill = "Race"
  ) +
  theme_minimal()
# Proportion by vacc_year and gender
# Line chart: for each gender, the share of its vaccinations that fell in each year.
vacc_1524 %>%
  mutate(
    # Spell out the single-letter codes; any other value passes through unchanged
    gender = case_when(
      gender == "F" ~ "Female",
      gender == "M" ~ "Male",
      TRUE ~ gender
    ),
    vacc_year = as.integer(vacc_year) # Ensure year is whole number
  ) %>%
  count(gender, vacc_year, name = "count") %>%
  group_by(gender) %>%
  # Denominator is the gender's total over all years, so each line sums to 100
  mutate(
    total = sum(count),
    proportion = (count / total) * 100
  ) %>%
  ungroup() %>%
  ggplot(aes(x = vacc_year, y = proportion, color = gender)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  # Let scales pick evenly spaced axis breaks (intended to land on whole years)
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  scale_color_manual(values = c("Male" = "#1f77b4", "Female" = "#ff7f0e")) +
  labs(
    title = "Vaccination Proportion by Gender and Year",
    x = "Vaccination Year",
    y = "Proportion (%)",
    color = "Gender"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.position = "right"
  )
Vaccination proportion by month and year
Summary and Key Takeaways
Key techniques covered:
Proper date conversion from character strings
Accurate age and time difference calculations
Seasonal and temporal pattern analysis
Visualization of date-based trends
Survival analysis for time-to-event data
Best practices:
Always validate date conversions
Document time units clearly (days vs. months vs. years)
Handle missing dates explicitly
Consider time zones for multi-center studies
References
- https://CRAN.R-project.org/package=dplyr
- https://www.jstatsoft.org/article/view/v103i01
- https://r4ds.hadley.nz/