Load installed packages
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
Load CSV files
daily_activity <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv")
sleep_day <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv")
hourly_intensities <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv")
hourly_calories <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv")
weight_log <- read.csv("C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv")
Review head data of loaded CSV files
head(daily_activity)
head(sleep_day)
head(hourly_intensities)
head(hourly_calories)
head(weight_log)
Review column names of loaded CSV files
colnames(daily_activity)
[1] "Id" "ActivityDate"
[3] "TotalSteps" "TotalDistance"
[5] "TrackerDistance" "LoggedActivitiesDistance"
[7] "VeryActiveDistance" "ModeratelyActiveDistance"
[9] "LightActiveDistance" "SedentaryActiveDistance"
[11] "VeryActiveMinutes" "FairlyActiveMinutes"
[13] "LightlyActiveMinutes" "SedentaryMinutes"
[15] "Calories"
colnames(sleep_day)
[1] "Id" "SleepDay" "TotalSleepRecords"
[4] "TotalMinutesAsleep" "TotalTimeInBed"
colnames(hourly_intensities)
[1] "Id" "ActivityHour" "TotalIntensity" "AverageIntensity"
colnames(hourly_calories)
[1] "Id" "ActivityHour" "Calories"
colnames(weight_log)
[1] "Id" "Date" "WeightKg" "WeightPounds"
[5] "Fat" "BMI" "IsManualReport" "LogId"
Fix date mismatches
The daily_activity contains just a date but the others contain a date
and time. I will need to separate the date and time into their own
columns in these other data frames. Additionally, the data type for each
date column will need to be set to a date format as well:
sleep_day <- sleep_day %>%
mutate(
SleepDay = mdy_hms(SleepDay), # Parse datetime (adjust format if needed)
Date = as.Date(SleepDay),
)
head(sleep_day)
daily_activity <- daily_activity %>%
mutate(
ActivityDate = mdy(ActivityDate),
Date = as.Date(ActivityDate)
)
hourly_intensities <- hourly_intensities %>%
mutate(
ActivityHour = mdy_hms(ActivityHour), # Parse datetime (adjust format if needed)
Date = as.Date(ActivityHour),
Time = format(ActivityHour, "%H:%M")
)
hourly_calories <- hourly_calories %>%
mutate(
ActivityHour = mdy_hms(ActivityHour), # Parse datetime (adjust format if needed)
Date = as.Date(ActivityHour),
Time = format(ActivityHour, "%H:%M")
)
weight_log <- weight_log %>%
mutate(
Date = mdy_hms(Date), # Parse datetime (adjust format if needed)
Date = as.Date(Date)
)
Summary Statistics
How many unique participants are there in each dataframe?
n_distinct(daily_activity$Id)
[1] 33
n_distinct(sleep_day$Id)
[1] 24
n_distinct(hourly_intensities$Id)
[1] 33
n_distinct(hourly_calories$Id)
[1] 33
n_distinct(weight_log$Id)
[1] 8
How many observations are there in each dataframe?
nrow(daily_activity)
[1] 940
nrow(sleep_day)
[1] 413
nrow(hourly_intensities)
[1] 22099
nrow(hourly_calories)
[1] 22099
nrow(weight_log)
[1] 67
daily_activity dataframe:
daily_activity %>%
select(TotalSteps,
TotalDistance,
SedentaryMinutes,
Calories) %>%
summary()
TotalSteps TotalDistance SedentaryMinutes Calories
Min. : 0 Min. : 0.000 Min. : 0.0 Min. : 0
1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8 1st Qu.:1828
Median : 7406 Median : 5.245 Median :1057.5 Median :2134
Mean : 7638 Mean : 5.490 Mean : 991.2 Mean :2304
3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5 3rd Qu.:2793
Max. :36019 Max. :28.030 Max. :1440.0 Max. :4900
sleep_day dataframe:
sleep_day %>%
select(TotalSleepRecords,
TotalMinutesAsleep,
TotalTimeInBed) %>%
summary()
TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
Min. :1.000 Min. : 58.0 Min. : 61.0
1st Qu.:1.000 1st Qu.:361.0 1st Qu.:403.0
Median :1.000 Median :433.0 Median :463.0
Mean :1.119 Mean :419.5 Mean :458.6
3rd Qu.:1.000 3rd Qu.:490.0 3rd Qu.:526.0
Max. :3.000 Max. :796.0 Max. :961.0
hourly_intensities dataframe:
hourly_intensities %>%
select(TotalIntensity,
AverageIntensity) %>%
summary()
TotalIntensity AverageIntensity
Min. : 0.00 Min. :0.0000
1st Qu.: 0.00 1st Qu.:0.0000
Median : 3.00 Median :0.0500
Mean : 12.04 Mean :0.2006
3rd Qu.: 16.00 3rd Qu.:0.2667
Max. :180.00 Max. :3.0000
hourly_calories dataframe:
hourly_calories %>%
select(Calories) %>%
summary()
Calories
Min. : 42.00
1st Qu.: 63.00
Median : 83.00
Mean : 97.39
3rd Qu.:108.00
Max. :948.00
Joining sleep_day and daily_activity
activity_sleep <- full_join(sleep_day, daily_activity, by=c("Id", "Date"))
Verify Distinct Id Count After Join
n_distinct(activity_sleep$Id)
[1] 33
Visualization
ggplot(data=daily_activity, aes(x=TotalSteps, y=SedentaryMinutes)) + geom_point()+ geom_smooth()+ labs(title="Total Steps vs. Sedentary Minutes", y="Total Sedentary Minutes", x="Total Steps")

ggplot(data=daily_activity, aes(x=TotalSteps, y=Calories)) + geom_point() + geom_smooth()+ labs(title="Total Steps vs. Calories", x="Total Steps")

ggplot(data=sleep_day, aes(x=TotalMinutesAsleep, y=TotalTimeInBed)) + geom_point()+ geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Time In Bed", y="Total Time in Bed", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=TotalSteps)) + geom_point() + geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Steps", y="Total Steps", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=SedentaryMinutes)) + geom_point() + geom_smooth()+ labs(title="Total Sleep Minutes vs. Total Sedentary Minutes", y="Total Sedentary Minutes", x="Total Sleep Minutes")

ggplot(data = activity_sleep, aes(x=TotalMinutesAsleep, y=Calories)) + geom_point() + geom_smooth() + labs(title="Total Sleep Minutes vs. Calories", x="Total Sleep Minutes")

grouped_hourly_intensities <- hourly_intensities %>%
group_by(Time) %>%
drop_na() %>%
summarise(mean_intensity = mean(TotalIntensity))
ggplot(data=grouped_hourly_intensities, aes(x=Time, y=mean_intensity)) + geom_col(fill='grey', color="darkblue") + theme(axis.text.x = element_text(angle = 45)) + labs(title="Average Total Intensity vs. Time", y="Average Total Intensity")

grouped_hourly_calories <- hourly_calories %>%
group_by(Time) %>%
drop_na() %>%
summarise(mean_calories = mean(Calories))
ggplot(data=grouped_hourly_calories, aes(x=Time, y=mean_calories)) + geom_col(fill='grey', color="darkblue") + theme(axis.text.x = element_text(angle = 45)) + labs(title="Average Calories vs. Time", y="Mean Calories")

Correlations
# Correlations with p-values
# Sedentary vs Sleep
cor_test_sed_sleep <- cor.test(activity_sleep$SedentaryMinutes, activity_sleep$TotalMinutesAsleep)
print(cor_test_sed_sleep) # r and p-value
Pearson's product-moment correlation
data: activity_sleep$SedentaryMinutes and activity_sleep$TotalMinutesAsleep
t = -15.181, df = 411, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.6578402 -0.5337719
sample estimates:
cor
-0.599394
# Steps vs Sedentary
cor_test_steps_sed <- cor.test(daily_activity$TotalSteps, daily_activity$SedentaryMinutes)
print(cor_test_steps_sed)
Pearson's product-moment correlation
data: daily_activity$TotalSteps and daily_activity$SedentaryMinutes
t = -10.615, df = 938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3833971 -0.2691782
sample estimates:
cor
-0.3274835
# Steps vs Calories
cor_test_steps_cal <- cor.test(daily_activity$TotalSteps, daily_activity$Calories)
print(cor_test_steps_cal)
Pearson's product-moment correlation
data: daily_activity$TotalSteps and daily_activity$Calories
t = 22.472, df = 938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5483688 0.6316184
sample estimates:
cor
0.5915681
# Sleep vs Time in Bed
cor_test_sleep_bed <- cor.test(sleep_day$TotalMinutesAsleep, sleep_day$TotalTimeInBed)
print(cor_test_sleep_bed)
Pearson's product-moment correlation
data: sleep_day$TotalMinutesAsleep and sleep_day$TotalTimeInBed
t = 51.483, df = 411, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9162253 0.9423445
sample estimates:
cor
0.9304575
# Sleep efficiency (ratio) for insomnia outliers
sleep_day$SleepEfficiency <- sleep_day$TotalMinutesAsleep / sleep_day$TotalTimeInBed
mean_eff <- mean(sleep_day$SleepEfficiency, na.rm = TRUE)
std_eff <- sd(sleep_day$SleepEfficiency, na.rm = TRUE)
outliers <- sleep_day[sleep_day$SleepEfficiency < (mean_eff - 2 * std_eff), ]
print(paste("Mean Efficiency:", round(mean_eff, 3), "Std:", round(std_eff, 3), "Outliers:", nrow(outliers)))
[1] "Mean Efficiency: 0.917 Std: 0.087 Outliers: 27"
# Visualize efficiency
ggplot(sleep_day, aes(x = SleepEfficiency)) + geom_histogram(binwidth = 0.01, color="darkblue", fill="gray") +
labs(title = "Distribution of Sleep Efficiency", y = "Number of Sleep Records", x="Sleep Efficiency")

Data Limitations
- Sample size: Only 33 unique IDs
- No gender/age: Limits applicability to Bellabeat’s female
users.
- Bias: Self-selected participants; weight logging low due to manual
entry.
Key Insights From Data Analysis:
- 5pm-7pm are the most active hours of the day as confirmed by the
comparisons of Average Total Intensity and Calories burned to Time.
- 12pm-2pm is the second most active time period.
- People with higher sedentary minutes had less total minutes
asleep.
- Total sleep minutes increase with total time in bed so outliers that
spend more time in bed but get less sleep may be having insomnia or poor
sleep hygiene.
- People that took the most steps generally burned the most
calories
- There is a unique relationship between total steps and sedentary
minutes. Sedentary minutes decrease with more total steps up to about
10,000 steps. There is an inflection point here where more steps after
10,000 tends to lead to an increase in sedentary minutes which may
indicate fatigue or needed rest.
Recommendations:
- Send activity related notification reminders to users to be active
prior to the 5pm-7pm most active window, especially if they have not
been active earlier in the day.
- Send notifications to people with less total minutes slept to
improve sleep by being more active throughout the day.
- Send notifications to outlying users who spend more time in bed but
get less sleep than the average with sleep hygiene tips since they might
be experiencing insomnia.
- Send notifications to people that exceed 10,000 steps per day to
remind them that they might need more recovery and rest activities.
---
title: "BellaBeat Case Study"
output:
  html_notebook: default
  pdf_document: default
---
### Load installed packages
```{r}
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
```
### Load CSV files
```{r}
# Folder holding the Fitabase CSV export. Change this one value to run the
# notebook against a copy of the data stored somewhere else.
data_dir <- "C:/Users/cmor7/OneDrive/Desktop/Case Study/BellaBeat/Data/Fitabase Data 4.12.16-5.12.16"

# Fail early with a readable message instead of five separate read errors.
if (!dir.exists(data_dir)) {
  stop("Data folder not found: ", data_dir, call. = FALSE)
}

# Read one CSV file located in `data_dir` and return it as a data frame.
read_fitabase <- function(file_name) {
  read.csv(file.path(data_dir, file_name))
}

daily_activity <- read_fitabase("dailyActivity_merged.csv")
sleep_day <- read_fitabase("sleepDay_merged.csv")
hourly_intensities <- read_fitabase("hourlyIntensities_merged.csv")
hourly_calories <- read_fitabase("hourlyCalories_merged.csv")
weight_log <- read_fitabase("weightLogInfo_merged.csv")
```
### Review head data of loaded CSV files
```{r}
# Show the first rows of every loaded table as a quick sanity check.
daily_activity %>% head()
sleep_day %>% head()
hourly_intensities %>% head()
hourly_calories %>% head()
weight_log %>% head()
```
### Review column names of loaded CSV files
```{r}
# List the column names of each table to see which fields can be joined on.
names(daily_activity)
names(sleep_day)
names(hourly_intensities)
names(hourly_calories)
names(weight_log)
```
### Fix date mismatches
The daily_activity data frame contains just a date, but the others contain a date and time. I will need to separate the date and time into their own columns in these other data frames. Additionally, the data type for each date column will need to be set to a date format as well:
```{r}
# Turn the SleepDay text into a date-time, then add a date-only column
# named Date.
sleep_day <- mutate(sleep_day, SleepDay = mdy_hms(SleepDay))
sleep_day <- mutate(sleep_day, Date = as.Date(SleepDay))
head(sleep_day)

# ActivityDate holds a date only: parse it, then copy it into a Date column.
daily_activity <- daily_activity %>%
  mutate(ActivityDate = mdy(ActivityDate)) %>%
  mutate(Date = as.Date(ActivityDate))

# Convert the hour stamp to a date-time, then derive a calendar date and an
# "HH:MM" clock label from it.
hourly_intensities <- hourly_intensities %>%
  mutate(ActivityHour = mdy_hms(ActivityHour)) %>%
  mutate(
    Date = as.Date(ActivityHour),
    Time = format(ActivityHour, "%H:%M")
  )

# Same treatment as hourly_intensities: parse the hour stamp, then split it
# into a Date column and an "HH:MM" Time column.
hourly_calories <- mutate(hourly_calories, ActivityHour = mdy_hms(ActivityHour))
hourly_calories <- mutate(
  hourly_calories,
  Date = as.Date(ActivityHour),
  Time = format(ActivityHour, "%H:%M")
)

# Parse the logged date-time text and keep only its calendar date.
weight_log <- weight_log %>%
  mutate(Date = as.Date(mdy_hms(Date)))
```
## Summary Statistics
### How many unique participants are there in each dataframe?
```{r}
# Number of distinct participant Ids present in each table.
daily_activity %>% pull(Id) %>% n_distinct()
sleep_day %>% pull(Id) %>% n_distinct()
hourly_intensities %>% pull(Id) %>% n_distinct()
hourly_calories %>% pull(Id) %>% n_distinct()
weight_log %>% pull(Id) %>% n_distinct()
```
### How many observations are there in each dataframe?
```{r}
# Row count of each table.
daily_activity %>% nrow()
sleep_day %>% nrow()
hourly_intensities %>% nrow()
hourly_calories %>% nrow()
weight_log %>% nrow()
```
### daily_activity dataframe:
```{r}
# Five-number summary plus mean for the main daily activity measures.
summary(
  select(daily_activity, TotalSteps, TotalDistance, SedentaryMinutes, Calories)
)
```
### sleep_day dataframe:
```{r}
# Five-number summary plus mean for the sleep measures.
summary(
  select(sleep_day, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)
```
### hourly_intensities dataframe:
```{r}
# Five-number summary plus mean for the hourly intensity measures.
summary(select(hourly_intensities, TotalIntensity, AverageIntensity))
```
### hourly_calories dataframe:
```{r}
# Five-number summary plus mean for hourly calories. select() keeps the
# result a one-column data frame so the output is labelled "Calories".
summary(select(hourly_calories, Calories))
```
## Joining sleep_day and daily_activity
```{r}
# Full join on participant and calendar date: rows from either table that
# have no match in the other are kept, with NA in the unmatched columns.
activity_sleep <- sleep_day %>%
  full_join(daily_activity, by = c("Id", "Date"))
```
### Verify Distinct Id Count After Join
```{r}
# Count the distinct participant Ids in the joined table.
activity_sleep %>% pull(Id) %>% n_distinct()
```
## Visualization 
```{r}
# Each plot below is a scatter of two measures with geom_smooth()'s default
# trend line drawn on top.

# Daily steps against sedentary minutes.
ggplot(data = daily_activity, aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Steps vs. Sedentary Minutes",
    y = "Total Sedentary Minutes",
    x = "Total Steps"
  )

# Daily steps against calories.
ggplot(data = daily_activity, aes(x = TotalSteps, y = Calories)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Total Steps vs. Calories", x = "Total Steps")

# Minutes asleep against time in bed.
ggplot(data = sleep_day, aes(x = TotalMinutesAsleep, y = TotalTimeInBed)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Time In Bed",
    y = "Total Time in Bed",
    x = "Total Sleep Minutes"
  )

# Minutes asleep against steps, using the joined table.
ggplot(data = activity_sleep, aes(x = TotalMinutesAsleep, y = TotalSteps)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Steps",
    y = "Total Steps",
    x = "Total Sleep Minutes"
  )

# Minutes asleep against sedentary minutes, using the joined table.
ggplot(
  data = activity_sleep,
  aes(x = TotalMinutesAsleep, y = SedentaryMinutes)
) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Total Sleep Minutes vs. Total Sedentary Minutes",
    y = "Total Sedentary Minutes",
    x = "Total Sleep Minutes"
  )

# Minutes asleep against calories, using the joined table.
ggplot(data = activity_sleep, aes(x = TotalMinutesAsleep, y = Calories)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Total Sleep Minutes vs. Calories", x = "Total Sleep Minutes")

# Mean TotalIntensity for each "HH:MM" Time label. drop_na() is called
# without column arguments, so a row with an NA in any column is removed
# before averaging.
grouped_hourly_intensities <- hourly_intensities %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_intensity = mean(TotalIntensity))

# Bar chart of the hourly means; x-axis labels are rotated 45 degrees.
ggplot(data = grouped_hourly_intensities, aes(x = Time, y = mean_intensity)) +
  geom_col(fill = "grey", color = "darkblue") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Average Total Intensity vs. Time",
    y = "Average Total Intensity"
  )

# Mean Calories for each "HH:MM" Time label. drop_na() is called without
# column arguments, so a row with an NA in any column is removed before
# averaging.
grouped_hourly_calories <- hourly_calories %>%
  group_by(Time) %>%
  drop_na() %>%
  summarise(mean_calories = mean(Calories))

# Bar chart of the hourly means; x-axis labels are rotated 45 degrees.
ggplot(data = grouped_hourly_calories, aes(x = Time, y = mean_calories)) +
  geom_col(fill = "grey", color = "darkblue") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Average Calories vs. Time", y = "Mean Calories")
```
## Correlations
```{r}
# Correlation tests (cor.test defaults). Printing each result shows the
# correlation estimate, its confidence interval and the p-value.

# Sedentary minutes vs. minutes asleep (joined table)
cor_test_sed_sleep <- cor.test(
  activity_sleep$SedentaryMinutes,
  activity_sleep$TotalMinutesAsleep
)
print(cor_test_sed_sleep)

# Total steps vs. sedentary minutes
cor_test_steps_sed <- cor.test(
  daily_activity$TotalSteps,
  daily_activity$SedentaryMinutes
)
print(cor_test_steps_sed)

# Total steps vs. calories
cor_test_steps_cal <- cor.test(
  daily_activity$TotalSteps,
  daily_activity$Calories
)
print(cor_test_steps_cal)

# Minutes asleep vs. time in bed
cor_test_sleep_bed <- cor.test(
  sleep_day$TotalMinutesAsleep,
  sleep_day$TotalTimeInBed
)
print(cor_test_sleep_bed)

# Sleep efficiency = minutes asleep / minutes in bed. Records more than two
# standard deviations below the mean are flagged as possible insomnia
# outliers.
sleep_day$SleepEfficiency <- sleep_day$TotalMinutesAsleep / sleep_day$TotalTimeInBed
mean_eff <- mean(sleep_day$SleepEfficiency, na.rm = TRUE)
std_eff <- sd(sleep_day$SleepEfficiency, na.rm = TRUE)
outlier_cutoff <- mean_eff - 2 * std_eff

# A logical index containing NA/NaN (e.g. 0 / 0 when TotalTimeInBed is 0)
# makes `[` return all-NA rows, which would inflate the outlier count.
# which() keeps only the positions where the comparison is TRUE.
outliers <- sleep_day[which(sleep_day$SleepEfficiency < outlier_cutoff), ]
print(paste("Mean Efficiency:", round(mean_eff, 3), "Std:", round(std_eff, 3), "Outliers:", nrow(outliers)))

# Histogram of sleep efficiency, using bins 0.01 wide.
ggplot(sleep_day, aes(x = SleepEfficiency)) +
  geom_histogram(binwidth = 0.01, color = "darkblue", fill = "gray") +
  labs(
    title = "Distribution of Sleep Efficiency",
    y = "Number of Sleep Records",
    x = "Sleep Efficiency"
  )
```
## Data Limitations
- Sample size: Only 33 unique IDs
- No gender/age: Limits applicability to Bellabeat's female users.
- Bias: Self-selected participants; weight logging low due to manual entry.

## Key Insights From Data Analysis:
1) 5pm-7pm are the most active hours of the day as confirmed by the comparisons of Average Total Intensity and Calories burned to Time. 
2) 12pm-2pm is the second most active time period.  
3) People with higher sedentary minutes had less total minutes asleep.  
4) Total sleep minutes increase with total time in bed so outliers that spend more time in bed but get less sleep may be having insomnia or poor sleep hygiene. 
5) People that took the most steps generally burned the most calories
6) There is a unique relationship between total steps and sedentary minutes. Sedentary minutes decrease with more total steps up to about 10,000 steps.  There is an inflection point here where more steps after 10,000 tend to lead to an increase in sedentary minutes, which may indicate fatigue or needed rest. 

## Recommendations:
1) Send activity related notification reminders to users to be active prior to the 5pm-7pm most active window, especially if they have not been active earlier in the day.
2) Send notifications to people with less total minutes slept to improve sleep by being more active throughout the day. 
3) Send notifications to outlying users who spend more time in bed but get less sleep than the average with sleep hygiene tips since they might be experiencing insomnia. 
4) Send notifications to people that exceed 10,000 steps per day to remind them that they might need more recovery and rest activities. 

