Load installed packages

library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)

Load CSV files

daily_activity <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv")
sleep_day <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv")
hourly_intensities <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv")
hourly_calories <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv")
weight_log <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv")

Review head data of loaded CSV files

head(daily_activity)
head(sleep_day)
head(hourly_intensities)
head(hourly_calories)
head(weight_log)

Review column names of loaded CSV files

colnames(daily_activity)
 [1] "Id"                       "ActivityDate"            
 [3] "TotalSteps"               "TotalDistance"           
 [5] "TrackerDistance"          "LoggedActivitiesDistance"
 [7] "VeryActiveDistance"       "ModeratelyActiveDistance"
 [9] "LightActiveDistance"      "SedentaryActiveDistance" 
[11] "VeryActiveMinutes"        "FairlyActiveMinutes"     
[13] "LightlyActiveMinutes"     "SedentaryMinutes"        
[15] "Calories"                
colnames(sleep_day)
[1] "Id"                 "SleepDay"           "TotalSleepRecords" 
[4] "TotalMinutesAsleep" "TotalTimeInBed"    
colnames(hourly_intensities)
[1] "Id"               "ActivityHour"     "TotalIntensity"   "AverageIntensity"
colnames(hourly_calories)
[1] "Id"           "ActivityHour" "Calories"    
colnames(weight_log)
[1] "Id"             "Date"           "WeightKg"       "WeightPounds"  
[5] "Fat"            "BMI"            "IsManualReport" "LogId"         

Fix date mismatches

The daily_activity data frame contains only a date, but the other data frames contain a date and a time. I will need to separate the date and time into their own columns in those data frames. Additionally, each date column will need to be converted to a date data type:

sleep_day <- sleep_day %>% 
  mutate(
    SleepDay = mdy_hms(SleepDay),  # Parse datetime (adjust format if needed)
    Date = as.Date(SleepDay),
  )
head(sleep_day)

daily_activity <- daily_activity %>% 
  mutate(
    ActivityDate = mdy(ActivityDate),
    Date = as.Date(ActivityDate)
  )

hourly_intensities <- hourly_intensities %>% 
  mutate(
    ActivityHour = mdy_hms(ActivityHour),  # Parse datetime (adjust format if needed)
    Date = as.Date(ActivityHour),
    Time = format(ActivityHour, "%H:%M")
  )

hourly_calories <- hourly_calories %>% 
  mutate(
    ActivityHour = mdy_hms(ActivityHour),  # Parse datetime (adjust format if needed)
    Date = as.Date(ActivityHour),
    Time = format(ActivityHour, "%H:%M")
  )

weight_log <- weight_log %>% 
  mutate(
    Date = mdy_hms(Date),  # Parse datetime (adjust format if needed)
    Date = as.Date(Date)
  )

Summary Statistics

How many unique participants are there in each dataframe?

n_distinct(daily_activity$Id)
[1] 33
n_distinct(sleep_day$Id)
[1] 24
n_distinct(hourly_intensities$Id)
[1] 33
n_distinct(hourly_calories$Id)
[1] 33
n_distinct(weight_log$Id)
[1] 8

How many observations are there in each dataframe?

nrow(daily_activity)
[1] 940
nrow(sleep_day)
[1] 413
nrow(hourly_intensities)
[1] 22099
nrow(hourly_calories)
[1] 22099
nrow(weight_log)
[1] 67

daily_activity dataframe:

daily_activity %>%  
  select(TotalSteps,
         TotalDistance,
         SedentaryMinutes,
         Calories) %>%
  summary()
   TotalSteps    TotalDistance    SedentaryMinutes    Calories   
 Min.   :    0   Min.   : 0.000   Min.   :   0.0   Min.   :   0  
 1st Qu.: 3790   1st Qu.: 2.620   1st Qu.: 729.8   1st Qu.:1828  
 Median : 7406   Median : 5.245   Median :1057.5   Median :2134  
 Mean   : 7638   Mean   : 5.490   Mean   : 991.2   Mean   :2304  
 3rd Qu.:10727   3rd Qu.: 7.713   3rd Qu.:1229.5   3rd Qu.:2793  
 Max.   :36019   Max.   :28.030   Max.   :1440.0   Max.   :4900  

sleep_day dataframe:

sleep_day %>%  
  select(TotalSleepRecords,
         TotalMinutesAsleep,
         TotalTimeInBed) %>%
  summary()
 TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
 Min.   :1.000     Min.   : 58.0      Min.   : 61.0  
 1st Qu.:1.000     1st Qu.:361.0      1st Qu.:403.0  
 Median :1.000     Median :433.0      Median :463.0  
 Mean   :1.119     Mean   :419.5      Mean   :458.6  
 3rd Qu.:1.000     3rd Qu.:490.0      3rd Qu.:526.0  
 Max.   :3.000     Max.   :796.0      Max.   :961.0  

hourly_intensities dataframe:

hourly_intensities %>%  
  select(TotalIntensity,
         AverageIntensity) %>%
  summary()
 TotalIntensity   AverageIntensity
 Min.   :  0.00   Min.   :0.0000  
 1st Qu.:  0.00   1st Qu.:0.0000  
 Median :  3.00   Median :0.0500  
 Mean   : 12.04   Mean   :0.2006  
 3rd Qu.: 16.00   3rd Qu.:0.2667  
 Max.   :180.00   Max.   :3.0000  

hourly_calories dataframe:

hourly_calories %>%  
  select(Calories) %>%
  summary()
    Calories     
 Min.   : 42.00  
 1st Qu.: 63.00  
 Median : 83.00  
 Mean   : 97.39  
 3rd Qu.:108.00  
 Max.   :948.00  

Joining sleep_day and daily_activity

activity_sleep <- full_join(sleep_day, daily_activity, by=c("Id", "Date"))

Verify Distinct Id Count After Join

n_distinct(activity_sleep$Id)
[1] 33

Visualization

ggplot(data=daily_activity, aes(x=TotalSteps, y=SedentaryMinutes)) + geom_point()+ geom_smooth()+ labs(title="Total Steps vs. Sedentary Minutes", y="Total Sedentary Minutes", x="Total Steps")

ggplot(data=daily_activity, aes(x=TotalSteps, y=Calories)) + geom_point() + geom_smooth()+ labs(title="Total Steps vs. Calories", x="Total Steps")

ggplot(data=sleep_day, aes(x=TotalMinutesAsleep, y=TotalTimeInBed)) + geom_point()+ geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Time In Bed", y="Total Time in Bed", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=TotalSteps)) + geom_point() + geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Steps", y="Total Steps", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=SedentaryMinutes)) + geom_point() + geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Sedentary Minutes", y="Total Sedentary Minutes", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=Calories)) + geom_point() + geom_smooth() + labs(title="Total Sleep Minutes vs. Calories", x="Total Sleep Minutes")


grouped_hourly_intensities <- hourly_intensities %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_intensity = mean(TotalIntensity))

ggplot(data=grouped_hourly_intensities, aes(x=Time, y=mean_intensity)) + geom_col(fill='grey', color="darkblue") + theme(axis.text.x = element_text(angle = 45)) + labs(title="Average Total Intensity vs. Time", y="Average Total Intensity")


grouped_hourly_calories <- hourly_calories %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_calories = mean(Calories))

ggplot(data=grouped_hourly_calories, aes(x=Time, y=mean_calories)) + geom_col(fill='grey', color="darkblue") + theme(axis.text.x = element_text(angle = 45)) + labs(title="Average Calories vs. Time", y="Mean Calories")

Correlations

# Correlations with p-values
# Sedentary vs Sleep
cor_test_sed_sleep <- cor.test(activity_sleep$SedentaryMinutes, activity_sleep$TotalMinutesAsleep)
print(cor_test_sed_sleep)  # r and p-value

    Pearson's product-moment correlation

data:  activity_sleep$SedentaryMinutes and activity_sleep$TotalMinutesAsleep
t = -15.181, df = 411, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.6578402 -0.5337719
sample estimates:
      cor 
-0.599394 
# Steps vs Sedentary
cor_test_steps_sed <- cor.test(daily_activity$TotalSteps, daily_activity$SedentaryMinutes)
print(cor_test_steps_sed)

    Pearson's product-moment correlation

data:  daily_activity$TotalSteps and daily_activity$SedentaryMinutes
t = -10.615, df = 938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.3833971 -0.2691782
sample estimates:
       cor 
-0.3274835 
# Steps vs Calories
cor_test_steps_cal <- cor.test(daily_activity$TotalSteps, daily_activity$Calories)
print(cor_test_steps_cal)

    Pearson's product-moment correlation

data:  daily_activity$TotalSteps and daily_activity$Calories
t = 22.472, df = 938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5483688 0.6316184
sample estimates:
      cor 
0.5915681 
# Sleep vs Time in Bed
cor_test_sleep_bed <- cor.test(sleep_day$TotalMinutesAsleep, sleep_day$TotalTimeInBed)
print(cor_test_sleep_bed)

    Pearson's product-moment correlation

data:  sleep_day$TotalMinutesAsleep and sleep_day$TotalTimeInBed
t = 51.483, df = 411, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9162253 0.9423445
sample estimates:
      cor 
0.9304575 
# Sleep efficiency (ratio) for insomnia outliers
sleep_day$SleepEfficiency <- sleep_day$TotalMinutesAsleep / sleep_day$TotalTimeInBed
mean_eff <- mean(sleep_day$SleepEfficiency, na.rm = TRUE)
std_eff <- sd(sleep_day$SleepEfficiency, na.rm = TRUE)
outliers <- sleep_day[sleep_day$SleepEfficiency < (mean_eff - 2 * std_eff), ]
print(paste("Mean Efficiency:", round(mean_eff, 3), "Std:", round(std_eff, 3), "Outliers:", nrow(outliers)))
[1] "Mean Efficiency: 0.917 Std: 0.087 Outliers: 27"
# Visualize efficiency
ggplot(sleep_day, aes(x = SleepEfficiency)) + geom_histogram(binwidth = 0.01, color="darkblue", fill="gray") + 
  labs(title = "Distribution of Sleep Efficiency", y = "Number of Sleep Records", x="Sleep Efficiency")

Data Limitations

Key Insights From Data Analysis:

  1. 5pm-7pm are the most active hours of the day as confirmed by the comparisons of Average Total Intensity and Calories burned to Time.
  2. 12pm-2pm is the second most active time period.
  3. People with higher sedentary minutes had less total minutes asleep.
  4. Total sleep minutes increase with total time in bed, so outliers who spend more time in bed but get less sleep may be experiencing insomnia or poor sleep hygiene.
  5. People that took the most steps generally burned the most calories
  6. There is a non-linear relationship between total steps and sedentary minutes. Sedentary minutes decrease as total steps increase, up to about 10,000 steps. There is an inflection point here: more steps beyond 10,000 tend to be accompanied by an increase in sedentary minutes, which may indicate fatigue or a need for rest.

Recommendations:

  1. Send activity related notification reminders to users to be active prior to the 5pm-7pm most active window, especially if they have not been active earlier in the day.
  2. Send notifications to people with less total minutes slept to improve sleep by being more active throughout the day.
  3. Send notifications to outlying users who spend more time in bed but get less sleep than the average with sleep hygiene tips since they might be experiencing insomnia.
  4. Send notifications to people that exceed 10,000 steps per day to remind them that they might need more recovery and rest activities.
---
title: "BellaBeat Case Study"
output:
  html_notebook: default
  pdf_document: default
---
### Load installed packages
```{r}
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
```
### Load CSV files
```{r}
# Folder holding the Fitabase export (4.12.16-5.12.16). It is defined once so
# the notebook can be pointed at another copy of the data by editing one line.
data_dir <- "C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16"

# Read each merged CSV into its own data frame. file.path() joins the folder
# and file name with "/", giving the same paths as before.
daily_activity <- read.csv(file.path(data_dir, "dailyActivity_merged.csv"))
sleep_day <- read.csv(file.path(data_dir, "sleepDay_merged.csv"))
hourly_intensities <- read.csv(file.path(data_dir, "hourlyIntensities_merged.csv"))
hourly_calories <- read.csv(file.path(data_dir, "hourlyCalories_merged.csv"))
weight_log <- read.csv(file.path(data_dir, "weightLogInfo_merged.csv"))
```
### Review head data of loaded CSV files
```{r}
# Show the first rows of every imported table to confirm each file loaded
# and to get a first look at its columns.
daily_activity %>% head()
sleep_day %>% head()
hourly_intensities %>% head()
hourly_calories %>% head()
weight_log %>% head()
```
### Review column names of loaded CSV files
```{r}
# List the column names of each table; for a data frame names() returns the
# same character vector as colnames().
names(daily_activity)
names(sleep_day)
names(hourly_intensities)
names(hourly_calories)
names(weight_log)
```
### Fix date mismatches
The daily_activity data frame contains only a date, but the other data frames contain a date and a time. I will need to separate the date and time into their own columns in those data frames. Additionally, each date column will need to be converted to a date data type:
```{r}
# sleep_day: parse the SleepDay text as month/day/year plus time, then keep
# the calendar day in a separate Date column. mutate() evaluates its
# arguments in order, so Date is derived from the already-parsed SleepDay.
sleep_day <- mutate(
  sleep_day,
  SleepDay = mdy_hms(SleepDay),
  Date = as.Date(SleepDay)
)
head(sleep_day)

# daily_activity: ActivityDate holds a date only (month/day/year). Date
# mirrors it so this table has the same Date column as the others (Id and
# Date are the join keys used further down).
daily_activity <- mutate(
  daily_activity,
  ActivityDate = mdy(ActivityDate),
  Date = as.Date(ActivityDate)
)

# hourly_intensities: split the parsed timestamp into a calendar day (Date)
# and a 24-hour "HH:MM" label (Time) used for grouping by hour of day.
hourly_intensities <- mutate(
  hourly_intensities,
  ActivityHour = mdy_hms(ActivityHour),
  Date = as.Date(ActivityHour),
  Time = format(ActivityHour, "%H:%M")
)

# hourly_calories: same treatment as hourly_intensities.
hourly_calories <- mutate(
  hourly_calories,
  ActivityHour = mdy_hms(ActivityHour),
  Date = as.Date(ActivityHour),
  Time = format(ActivityHour, "%H:%M")
)

# weight_log: the Date column arrives as date-time text; parse it and keep
# only the calendar day, overwriting the column in place.
weight_log <- mutate(
  weight_log,
  Date = as.Date(mdy_hms(Date))
)
```
## Summary Statistics
### How many unique participants are there in each dataframe?
```{r}
# Number of distinct participant Ids present in each table.
n_distinct(daily_activity[["Id"]])
n_distinct(sleep_day[["Id"]])
n_distinct(hourly_intensities[["Id"]])
n_distinct(hourly_calories[["Id"]])
n_distinct(weight_log[["Id"]])
```
### How many observations are there in each dataframe?
```{r}
# Number of rows (observations) in each table.
daily_activity %>% nrow()
sleep_day %>% nrow()
hourly_intensities %>% nrow()
hourly_calories %>% nrow()
weight_log %>% nrow()
```
### daily_activity dataframe:
```{r}
# Min, quartiles, mean and max of the daily step, distance, sedentary-time
# and calorie columns.
summary(
  select(
    daily_activity,
    TotalSteps, TotalDistance, SedentaryMinutes, Calories
  )
)
```
### sleep_day dataframe:
```{r}
# Min, quartiles, mean and max of the sleep-record count, minutes asleep
# and time in bed.
summary(
  select(
    sleep_day,
    TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
  )
)
```
### hourly_intensities dataframe:
```{r}
# Min, quartiles, mean and max of the hourly total and average intensity.
summary(
  select(hourly_intensities, TotalIntensity, AverageIntensity)
)
```
### hourly_calories dataframe:
```{r}
# Min, quartiles, mean and max of the hourly calorie values.
summary(
  select(hourly_calories, Calories)
)
```
## Joining sleep_day and daily_activity
```{r}
# Combine sleep and activity records on participant and calendar day. A full
# join keeps days that appear in only one of the two tables; the columns
# coming from the other table are NA for those rows.
# NOTE(review): sleep_day is not de-duplicated before the join. Confirm it
# has at most one row per Id/Date, otherwise repeated rows carry through.
activity_sleep <- sleep_day %>%
  full_join(daily_activity, by = c("Id", "Date"))
```
### Verify Distinct Id Count After Join
```{r}
# Count the distinct participant Ids present in the joined table.
n_distinct(activity_sleep[["Id"]])
```
## Visualization 
```{r}
# Each plot below is a scatter plot with a smoothed trend line (geom_smooth()
# with its default settings).

# Daily steps against sedentary minutes.
ggplot(daily_activity, aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Steps vs. Sedentary Minutes",
    y = "Total Sedentary Minutes",
    x = "Total Steps"
  )

# Daily steps against calories (y axis keeps its default label).
ggplot(daily_activity, aes(x = TotalSteps, y = Calories)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Steps vs. Calories",
    x = "Total Steps"
  )

# Minutes asleep against time in bed.
ggplot(sleep_day, aes(x = TotalMinutesAsleep, y = TotalTimeInBed)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Time In Bed",
    y = "Total Time in Bed",
    x = "Total Sleep Minutes"
  )

# The remaining plots use the joined table, which holds both sleep and
# activity columns.

# Minutes asleep against steps.
ggplot(activity_sleep, aes(x = TotalMinutesAsleep, y = TotalSteps)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Steps",
    y = "Total Steps",
    x = "Total Sleep Minutes"
  )

# Minutes asleep against sedentary minutes.
ggplot(activity_sleep, aes(x = TotalMinutesAsleep, y = SedentaryMinutes)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Sedentary Minutes",
    y = "Total Sedentary Minutes",
    x = "Total Sleep Minutes"
  )

# Minutes asleep against calories (y axis keeps its default label).
ggplot(activity_sleep, aes(x = TotalMinutesAsleep, y = Calories)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Calories",
    x = "Total Sleep Minutes"
  )

# Mean TotalIntensity for each "HH:MM" hour-of-day label, after dropping
# rows that contain any NA.
grouped_hourly_intensities <- hourly_intensities %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_intensity = mean(TotalIntensity))

# Bar chart of the hourly means; x labels are tilted 45 degrees.
ggplot(grouped_hourly_intensities, aes(x = Time, y = mean_intensity)) +
  geom_col(fill = "grey", color = "darkblue") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Average Total Intensity vs. Time",
    y = "Average Total Intensity"
  )

# Mean Calories for each "HH:MM" hour-of-day label, after dropping rows
# that contain any NA.
grouped_hourly_calories <- hourly_calories %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_calories = mean(Calories))

# Bar chart of the hourly means; x labels are tilted 45 degrees.
ggplot(grouped_hourly_calories, aes(x = Time, y = mean_calories)) +
  geom_col(fill = "grey", color = "darkblue") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Average Calories vs. Time",
    y = "Mean Calories"
  )
```
## Correlations
```{r}
# Correlation tests (cor.test() with its default method, Pearson). Each result
# is stored and then printed so the coefficient, confidence interval and
# p-value appear in the notebook.
# The calls are kept as plain `df$column` expressions because the printed
# "data:" line is the deparsed argument text.

# Sedentary minutes vs. minutes asleep, from the joined table. Rows lacking
# either value (days present in only one source table) are NA; cor.test()
# uses only the complete pairs.
cor_test_sed_sleep <- cor.test(activity_sleep$SedentaryMinutes, activity_sleep$TotalMinutesAsleep)
print(cor_test_sed_sleep)  # shows r, its 95% confidence interval and the p-value

# Daily steps vs. sedentary minutes
cor_test_steps_sed <- cor.test(daily_activity$TotalSteps, daily_activity$SedentaryMinutes)
print(cor_test_steps_sed)

# Daily steps vs. calories
cor_test_steps_cal <- cor.test(daily_activity$TotalSteps, daily_activity$Calories)
print(cor_test_steps_cal)

# Minutes asleep vs. time in bed
cor_test_sleep_bed <- cor.test(sleep_day$TotalMinutesAsleep, sleep_day$TotalTimeInBed)
print(cor_test_sleep_bed)

# Sleep efficiency: the fraction of time in bed that was spent asleep.
# Unusually low values are treated as possible insomnia outliers.
sleep_day$SleepEfficiency <- sleep_day$TotalMinutesAsleep / sleep_day$TotalTimeInBed
mean_eff <- mean(sleep_day$SleepEfficiency, na.rm = TRUE)
std_eff <- sd(sleep_day$SleepEfficiency, na.rm = TRUE)

# Flag records more than two standard deviations below the mean efficiency.
# which() keeps only rows where the comparison is TRUE. A bare logical subset
# would turn every NA/NaN efficiency into an all-NA row and inflate the count,
# which would be inconsistent with the na.rm = TRUE used above.
low_eff_cutoff <- mean_eff - 2 * std_eff
outliers <- sleep_day[which(sleep_day$SleepEfficiency < low_eff_cutoff), ]
print(paste("Mean Efficiency:", round(mean_eff, 3), "Std:", round(std_eff, 3), "Outliers:", nrow(outliers)))

# Histogram of efficiency across all sleep records (bins of width 0.01).
ggplot(sleep_day, aes(x = SleepEfficiency)) +
  geom_histogram(binwidth = 0.01, color = "darkblue", fill = "gray") +
  labs(
    title = "Distribution of Sleep Efficiency",
    y = "Number of Sleep Records",
    x = "Sleep Efficiency"
  )
```
## Data Limitations
- Sample size: Only 33 unique IDs
- No gender/age: Limits applicability to Bellabeat's female users.
- Bias: Self-selected participants; weight logging low due to manual entry.

## Key Insights From Data Analysis:
1) 5pm-7pm are the most active hours of the day as confirmed by the comparisons of Average Total Intensity and Calories burned to Time. 
2) 12pm-2pm is the second most active time period.  
3) People with higher sedentary minutes had less total minutes asleep.  
4) Total sleep minutes increase with total time in bed, so outliers who spend more time in bed but get less sleep may be experiencing insomnia or poor sleep hygiene.
5) People that took the most steps generally burned the most calories
6) There is a non-linear relationship between total steps and sedentary minutes. Sedentary minutes decrease as total steps increase, up to about 10,000 steps. There is an inflection point here: more steps beyond 10,000 tend to be accompanied by an increase in sedentary minutes, which may indicate fatigue or a need for rest.

## Recommendations:
1) Send activity related notification reminders to users to be active prior to the 5pm-7pm most active window, especially if they have not been active earlier in the day.
2) Send notifications to people with less total minutes slept to improve sleep by being more active throughout the day. 
3) Send notifications to outlying users who spend more time in bed but get less sleep than the average with sleep hygiene tips since they might be experiencing insomnia. 
4) Send notifications to people that exceed 10,000 steps per day to remind them that they might need more recovery and rest activities. 

