# ================================
# script: Trip Statistics - Data Preparation and Analysis 2022-2024
# Author: Sven Lißner
# ================================

# Load Libraries

library(tidyverse)
library (readxl)
library(rstatix)
library(coin)
library(psych)
library(rmarkdown)
library(writexl)
library(ggpubr)
library(dplyr)
library(ggplot2)
library(patchwork)
library(scales)


 # Step 0

  # loading the data sets (using relative paths)
  # Assuming your script is in the root or a 'scripts' folder
  # and data is in the 'Data' folder


city_stats <-read.csv(file = 'Data/city_stats_short.csv', header = TRUE, sep = ";")

all_tripstats_2022 <- read.csv(file = 'Data/trip_stats_2022.csv',header=TRUE, sep = ";")
all_tripstats_2023 <- read.csv(file = 'Data/trip_stats_2023.csv',header=TRUE, sep = ";")
all_tripstats_2024 <- read.csv(file = 'Data/trip_stats_2024.csv',header=TRUE, sep = ";")

# Step 1. Data Statistics

combined_tripstats <- rbind(
  mutate(all_tripstats_2022, year = 2022),
  mutate(all_tripstats_2023, year = 2023),
  mutate(all_tripstats_2024, year = 2024)
) # combination of all data sets on trip basis

  # Step 1.1: basic data analysis on personal basis (unique) for every year (filtering unreasonable ages)

cyclists_unique_all <- combined_tripstats %>% distinct(device_id, .keep_all = TRUE)
cyclists_unique_2022 <- all_tripstats_2022 %>% distinct(device_id, .keep_all = TRUE)%>%filter(age >= 10 & age <= 90)
cyclists_unique_2023 <- all_tripstats_2023 %>% distinct(device_id, .keep_all = TRUE)%>%filter(age >= 10 & age <= 90)
cyclists_unique_2024 <- all_tripstats_2024 %>% distinct(device_id, .keep_all = TRUE)%>%filter(age >= 10 & age <= 90)

cyclists_all_years_combined <- rbind(
  mutate(cyclists_unique_2022, year = 2022),
  mutate(cyclists_unique_2023, year = 2023),
  mutate(cyclists_unique_2024, year = 2024)
) # all unique people

  # Step 1.1.1 Personal statistics per year

  comparison_table_cyclists <- cyclists_all_years_combined %>%
  mutate(
      gender_clean = case_when(
      gender == "FEMALE" ~ "female",
      gender == "MALE"   ~ "male",
      TRUE               ~ "NA" # /NA, -, nan etc.
    )
  ) %>%
  group_by(year, gender_clean) %>%
  summarise(
    Number = n(),
    Mean_age = mean(age, na.rm = TRUE),
    SD_Alter = sd(age, na.rm = TRUE),
    .groups = "drop"
  )

  print(comparison_table_cyclists)

# Step 1.1.2 Create Pie-Charts of comparison table

  pie_plot_data <- comparison_table_cyclists %>%
  group_by(year)%>%
  mutate(
    # compute percentage (0 to 1)
    perc = Number / sum(Number),
    label_text = ifelse(perc > 0.02, paste0(round(perc * 100), "%"), "")
  ) %>%
  ungroup()

ggplot(pie_plot_data, aes(x = "", y = Number, fill = gender_clean)) +
    geom_col(position = "fill", width = 1, color = "gray20") +
    coord_polar(theta = "y", start = 0) +
    facet_wrap(~year) +
    geom_text(aes(label = label_text), 
            position = position_fill(vjust = 0.5), 
            color = "white", 
            fontface = "bold", 
            size = 3.5) +
    scale_fill_viridis_d(option = "mako", begin = 0.3, end = 0.7, name = "Gender") +
    theme_void() + 
  theme(
    strip.text = element_text(face = "bold", size = 14, margin = margin(b = 10)),
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16, margin = margin(t = 10, b = 20)),
    plot.margin = margin(10, 10, 10, 10, "pt")
  ) +
  labs(title = "Gender Distribution")

# Step 1.1.3 Create violin plot for results of all years

cyclists_combined <- cyclists_all_years_combined %>%
  filter(gender %in% c("MALE","FEMALE")) %>%
    mutate(
    year = as.factor(year), # year as category
    gender_clean = case_when(
      gender == "FEMALE" ~ "female",
      gender == "MALE"   ~ "male" # only using 2 categories, because NA is comparably small
      
    )
  ) %>%
  # Diagramm 
  ggplot(aes(x = gender_clean, y = age, fill = gender_clean)) +
  # Violin shows Kernel Density of Age  Boxplot shows Numbers
  geom_violin(trim = TRUE, alpha = 0.5) +
  geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.8) +
  # Faceting: one diagram per year
    facet_wrap(~year) +
  # altering scale
  scale_y_continuous(
    breaks = seq(0, 100, by = 10), # Lines every 10 years (0, 10, 20...)
    limits = c(10, NA)             # starts at 10  (age Filter)
  ) +
  # altering design
  scale_fill_viridis_d(option = "mako", name = "Gender", begin =0.3, end = 0.7) + 
  theme_minimal() +
  theme(
    text = element_text(size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
    axis.title.x = element_text(margin = margin(t = 15, r = 0, b = 0, l = 0)),
    strip.text = element_text(face = "bold", size = 13),
    axis.text.x = element_text(angle = 0, hjust = 0.5), 
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16, margin = margin(t = 10, b = 20)),
    legend.position = "none",
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Age Distribution in Study Area by Gender",
    x = "Gender",
    y = "Age (years)"
  )

print(cyclists_combined)



# Step 1.2: Data preparing for analysis over all years, including trips (distance ->km, gender cleaning)

results_combined <- combined_tripstats %>%filter(age >= 10 & age <= 90) %>% filter(mode_type == 3) %>%
    mutate(
    distance_clean = as.numeric(gsub(",", ".", as.character(.data$distance))),
    distance_km = distance_clean / 1000,
    gender_clean = case_when(
      gender == "FEMALE" ~ "female",
      gender == "MALE"   ~ "male",
      TRUE               ~ "Other/Diverse" # all other like /NA, empty collumns etc.
    )
  ) %>%
  
  # 1.2.1a aggregation on personal trip level
  group_by(device_id, year) %>%
  summarise(
    gender_fixed = first(gender_clean),
    age = first(age),
    total_km_Person = sum(distance_km, na.rm = TRUE),
    avg_km_per_trip = mean(distance_km, na.rm = TRUE),
    Number_Trips = n()
  ) %>%
  # 1.2.1b analyse for gender
  group_by(gender_fixed, year) %>%
  summarise(
    number_persons = n(),
    total_trips_absolute = sum(Number_Trips),
    avg_trips_per_person = mean(Number_Trips),
    mean_age = mean(age),
    mean_km_trips_grp = mean(avg_km_per_trip),
    sd_km_trips_grp   = sd(avg_km_per_trip, na.rm = TRUE),
    total_km_of_group = sum(total_km_Person)
  )

print(results_combined)

#Step 1.2.2 plot results of all years -> avg speed histogram


plot_data_speed <- combined_tripstats %>%
  filter(mode_type == 3) %>% 
  mutate(
    avg_speed_kmh = speed_avg * 3.6 
  )
avg_val <- weighted.mean(plot_data_speed$avg_speed_kmh, 
                         w = plot_data_speed$distance, 
                         na.rm = TRUE
  )

ggplot(plot_data_speed, aes(x = avg_speed_kmh)) +
  geom_histogram(
    breaks = seq(0, 34, by = 1), 
    fill = viridis::viridis(n = 10, option = "mako")[4], 
    color = "white",  
    alpha = 0.9
  ) +
  # Vertical Line for weighted average speed
  geom_vline(aes(xintercept = avg_val), 
             color = "#E74C3C",      
             linetype = "dashed",    
             linewidth = 1) +
  # Optional: text label for the line
  annotate("text", 
           x = avg_val + 1,          
           y = 900000,              
           label = paste0("Ø ", round(avg_val, 1), " km/h"), 
           color = "#E74C3C", 
           fontface = "bold", 
           hjust = 0) +
  scale_y_continuous(
    breaks = seq(0, 5000000, by = 200000),
    labels = label_number(scale = 1e-6, suffix = "M", big.mark = ".", decimal.mark = ",")
  ) +
  scale_x_continuous(
    breaks = seq(0, 34, by = 2),
        labels = seq(0, 34, by = 2) 
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    axis.title.y = element_text(margin = margin(r = 15)),
    axis.title.x = element_text(margin = margin(t = 10))
  ) +
  labs(
    title = "Distribution of Avg. Trip Speed",
    x = "Speed (km/h)",
    subtitle = "Red line represents the distance-weighted average speed",
    y = "Number of Trips (in Millions)"
  )

# Step 1.2.3 Comparison of weighted average speed over the years

  weighted_avg_total <- plot_data_speed %>%
  summarise(
    weighted_speed = weighted.mean(avg_speed_kmh, w = distance, na.rm = TRUE)
  )

  cat("Gewichteter Geschwindigkeitsdurchschnitt:", round(weighted_avg_total$weighted_speed, 2), "km/h\n")
  speed_comparison_weighted <- plot_data_speed %>%
    group_by(year) %>%
    summarise(
      unweighted_mean = mean(avg_speed_kmh, na.rm = TRUE),
      weighted_mean = weighted.mean(avg_speed_kmh, w = distance, na.rm = TRUE),
      total_distance_km = sum(distance) / 1000
  )

print(speed_comparison_weighted)

# Step 1.2.4 plot results of all years -> avg distance histogram

  plot_data_dist <- combined_tripstats %>%
    filter(mode_type == 3) %>% # only "normal" rides, excluding sport rides.
    mutate(
      dist_km = distance / 1000,
      dist_capped = ifelse(dist_km >= 20, 20, dist_km)
  )
  ggplot(plot_data_dist, aes(x = dist_capped)) +
    geom_histogram(
      breaks = seq(0, 20, by = 1), 
      fill = viridis::viridis(n = 10, option = "mako")[4], 
      color = "white",  
      alpha = 0.9
  ) +
  scale_y_continuous(
    breaks = seq(0, 5000000, by = 200000),
    labels = label_number(scale = 1e-6, suffix = "M", big.mark = ".", decimal.mark = ",")
  ) +
  scale_x_continuous(
    breaks = seq(0, 20, by = 2),
    labels = c(seq(0, 18, by = 2), "20+") 
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    axis.title.y = element_text(margin = margin(r = 15)),
    axis.title.x = element_text(margin = margin(t = 10)),
    strip.text = element_text(face = "bold", size = 12)
  ) +
  labs(
    title = "Distribution of Trip Distances",
    subtitle = "Trips >20km are aggregated in the 20km bin",
    x = "Distance (km)",
    y = "Number of Trips"
  )

  # Step 1.2.5 Analysing sports cycling

results_combined_sport <- combined_tripstats %>%filter(age >= 10 & age <= 90) %>% filter(mode_type == 2) %>%
  mutate(
    distance_clean = as.numeric(gsub(",", ".", as.character(.data$distance))),
    distance_km = distance_clean / 1000,
    gender_clean = case_when(
      gender == "FEMALE" ~ "female",
      gender == "MALE"   ~ "male",
      TRUE               ~ "Other/Diverse" # all other like /NA, empty collumns etc.
    )
  ) %>%
  
  # 1.2.5a aggregation on personal trip level
  group_by(device_id, year) %>%
  summarise(
    gender_fixed = first(gender_clean),
    age = first(age),
    total_km_Person = sum(distance_km, na.rm = TRUE),
    avg_km_per_trip = mean(distance_km, na.rm = TRUE),
    Number_Trips = n()
  ) %>%
  # 1.2.5b analyse for gender
  group_by(gender_fixed, year) %>%
  summarise(
    number_persons = n(),
    total_trips_absolute = sum(Number_Trips),
    avg_trips_per_person = mean(Number_Trips),
    mean_age = mean(age),
    mean_km_trips_grp = mean(avg_km_per_trip),
    sd_km_trips_grp   = sd(avg_km_per_trip, na.rm = TRUE),
    total_km_of_group = sum(total_km_Person)
  )
print(results_combined_sport)



# Step 2.0 data pre processing and filtering, exemplary for one year 
  # Mode type 3 denotes cycling without sports cycling tracks, duplicates removed,
  # examplary generation of new variables like avg_speed_tt (distance/duration*3.6), min/km (minutes per kilometer cycled) and converting of time-stamps

processed_tripstats2022 <-read.csv(file = 'trip_stats_2022.csv',header=TRUE, sep = ";") %>% 
  filter(gender =='FEMALE' & `speed_v50` >= 1.5 & mode_type == '3' | gender == 'MALE' & `speed_v50` >= 1.5 & mode_type == '3') %>% #only female and male trips without sports cycling (4) 
  distinct() %>% mutate(waiting_events_km = `waiting_events_count` / (`distance` / 1000)) %>% # creating waiting events per kilometre
  mutate(avg_speed_tt = (distance / (duration)*3.6)) %>% mutate(min_km = (duration/60) / (distance/1000)) %>% # creating average speed and mintes per kilometre cycled
  mutate(acc_km = (accelerations_pos_count) / (distance/1000)) %>% mutate(dec_km = (accelerations_neg_count) / (distance/1000)) %>% # creating accelerations and decelerations per kilomtere cycled
  mutate(avg_acc_time = (accelerations_pos_total_time) / (accelerations_pos_count)) %>% # creating average time for an acceleration
  mutate(avg_dec_time = (accelerations_neg_total_time) / (accelerations_neg_count)) %>% # creating average time for a deceleration
  filter(distance >= 500 & waiting_events_km <10) %>% # Filter for trip length and number of waiting events per kilometre
  mutate(waiting_time_km = (waiting_events_total_duration) / (distance/1000)) %>% # creating waiting_time_km
  mutate(v85_time = distance/speed_v85)  %>% mutate(total_timeloss = (duration-v85_time)) %>% filter(total_timeloss >= 0) %>% # creating time loss / delay
  filter(!(distance<600 & detour_factor >= 5)) %>%  # filter for short trips with a high detoru factor (mostly artefacts)
  mutate(total_timeloss_km = ((duration-v85_time)/(distance/1000))) %>% # creating total time loss or bike delay per kilometre
  mutate(movement_timeloss = (total_timeloss-waiting_events_total_duration)) %>% # creating timeloss in movement as seperate variable
  mutate(movement_timeloss_km = ((total_timeloss-waiting_events_total_duration)/(distance/1000))) %>% # timeloss in movement per kilometre
  mutate(datetime_start = as.POSIXct(start_time, origin = "1970-01-01", tz = "Europe/Berlin"), weekday = format(datetime_start, "%A")) %>% 
  relocate(datetime_start, weekday, .after = end_time) # altering time-date format.

# Step 2.2 if you want to add age groups for statistics (used age groups are oriented on life phases)

processed_tripstats2022$agegroup <- processed_tripstats2022$age
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup < 16] <- 1
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup >= 16 & processed_tripstats2022$agegroup < 30 ] <- 2
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup >= 30 & processed_tripstats2022$agegroup < 45 ] <- 3
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup >= 45 & processed_tripstats2022$agegroup < 55 ] <- 4
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup >= 55 & processed_tripstats2022$agegroup < 65 ] <- 5
processed_tripstats2022$agegroup[processed_tripstats2022$agegroup >= 65] <- 6