```
# Setup: make sure the required packages are available, attach them, and
# read the workbook into `df`.
# Only packages that are not yet installed are installed, so re-running the
# script does not download and reinstall everything each time.
required_packages <- c("ggplot2", "readxl", "dplyr", "tidyr")
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(ggplot2) # For making nice charts
library(readxl) # For reading Excel files
library(dplyr) # For cleaning and summarizing data
library(tidyr) # For organizing data (if needed)
# The path is relative, so it is resolved against the working directory.
df <- read_excel("Airbnbc.xlsx")
# Quick look at the data: first five rows, the printed table, and a
# per-column summary.
head(df, n = 5)
print(df)
summary(df)
```
# Step 2: Answer Each Question with Specified Chart Types
# Question 1.1: Do private rooms cost more than shared rooms? (Column Chart)
```
# Keep only the "Private room" and "Shared room" listings, then compute the
# mean price and the number of listings for each of those room types.
private_shared <- df %>%
  filter(room_type %in% c("Private room", "Shared room"))
private_shared_summary <- private_shared %>%
  group_by(room_type) %>%
  summarise(
    avg_price = mean(price),
    count_listings = n()
  )
```
# Create a column chart
```
# Column chart with one bar per room type; bar height is the average price.
# Fill and x map the same variable, and the legend is switched off.
ggplot(private_shared_summary) +
  geom_col(aes(x = room_type, y = avg_price, fill = room_type)) +
  ggtitle("Average Price: Private vs Shared Rooms") +
  xlab("Room Type") +
  ylab("Average Price") +
  theme(legend.position = "none")
```
# Question 1.2: Does the number of accommodates affect the price of the rented space? (Line Plot)
```
# Mean price for each distinct value of no_of_accommodates.
accommodates_summary <- df %>%
  group_by(no_of_accommodates) %>%
  summarise(avg_price = mean(price))
```
# Create a line plot
```
# Line plot of average price against no_of_accommodates, with a point marker
# on every summarised value.
# Line thickness is set with `linewidth`; using `size` for lines is
# deprecated since ggplot2 3.4.0 and emits a warning. `size` is still the
# correct argument for the point markers.
ggplot(accommodates_summary, aes(x = no_of_accommodates, y = avg_price)) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Average Price Trend by Number of Accommodates",
    x = "Number of Accommodates",
    y = "Average Price"
  ) +
  theme_minimal()
```
# Question 1.3: Does the number of bathrooms affect the price? (Bar Plot)
```
# Mean price for each distinct value of no_of_bathrooms.
bathrooms_summary <- df %>%
  group_by(no_of_bathrooms) %>%
  summarise(avg_price = mean(price))
```
# Create a bar plot
```
# Bar plot of average price per bathroom count; bar heights are taken
# directly from avg_price (no counting is done by the geom).
ggplot(bathrooms_summary) +
  geom_col(aes(x = no_of_bathrooms, y = avg_price), fill = "blue") +
  ggtitle("Average Price by Number of Bathrooms") +
  xlab("Number of Bathrooms") +
  ylab("Average Price") +
  theme_minimal()
```
# Question 1.4: Does the number of bedrooms affect the price? (Bar Chart)
```
# Mean price for each distinct value of no_of_bedrooms.
bedrooms_summary <- df %>%
  group_by(no_of_bedrooms) %>%
  summarise(avg_price = mean(price))
```
# Create a bar chart
```
# Bar chart of average price per bedroom count; bar heights are taken
# directly from avg_price.
ggplot(bedrooms_summary) +
  geom_col(aes(x = no_of_bedrooms, y = avg_price), fill = "green") +
  ggtitle("Average Price by Number of Bedrooms") +
  xlab("Number of Bedrooms") +
  ylab("Average Price") +
  theme_minimal()
```
# Question 2.1: Do rooms with a flexible cancellation policy cost more than rooms with a strict policy? (Column Chart)
```
# Keep only listings whose policy is "flexible" or "strict", then compute the
# mean price and the number of listings for each of the two policies.
flex_strict <- df %>%
  filter(cancellation_policy %in% c("flexible", "strict"))
flex_strict_summary <- flex_strict %>%
  group_by(cancellation_policy) %>%
  summarise(
    avg_price = mean(price),
    count_listings = n()
  )
```
# Create a column chart
```
# Column chart with one bar per cancellation policy; bar height is the
# average price. The legend is switched off.
ggplot(flex_strict_summary) +
  geom_col(
    aes(x = cancellation_policy, y = avg_price, fill = cancellation_policy)
  ) +
  ggtitle("Average Price by Cancellation Policy") +
  xlab("Cancellation Policy") +
  ylab("Average Price") +
  theme(legend.position = "none")
```
# Question 2.2: Does instant booking of available rooms cost more? (Column Chart)
```
# Keep only rows whose instant_bookable value is "t" or "f", then compute the
# mean price and the number of listings for each value.
instant_booking <- df %>%
  filter(instant_bookable %in% c("t", "f"))
instant_booking_summary <- instant_booking %>%
  group_by(instant_bookable) %>%
  summarise(
    avg_price = mean(price),
    count_listings = n()
  )
```
# Create a column chart
```
# Column chart with one bar per instant_bookable value; bar height is the
# average price. The legend is switched off.
ggplot(instant_booking_summary) +
  geom_col(
    aes(x = instant_bookable, y = avg_price, fill = instant_bookable)
  ) +
  ggtitle("Average Price by Instant Bookable") +
  xlab("Instant Bookable (t = Yes, f = No)") +
  ylab("Average Price") +
  theme(legend.position = "none")
```
# Question 2.3: Do rooms with cleaning fees included cost more? (Column Chart)
```
# Mean price and number of listings for each distinct cleaning_fee value.
cleaning_fee_summary <- df %>%
  group_by(cleaning_fee) %>%
  summarise(
    avg_price = mean(price),
    count_listings = n()
  )
```
# Create a column chart
```
# Column chart with one bar per cleaning_fee value; bar height is the average
# price. The legend is switched off.
ggplot(cleaning_fee_summary) +
  geom_col(aes(x = cleaning_fee, y = avg_price, fill = cleaning_fee)) +
  ggtitle("Average Price by Cleaning Fee") +
  xlab("Cleaning Fee Included") +
  ylab("Average Price") +
  theme(legend.position = "none")
```
# Question 2.4: Do rooms with high ratings cost more? (Scatter Plot)
```
# Add a rating_category column: "High" when review_scores_rating is above 80,
# otherwise "Low". Then scatter price against the review score, colouring
# each point by that category.
df <- df %>%
  mutate(
    rating_category = ifelse(review_scores_rating > 80, "High", "Low")
  )
ggplot(df) +
  geom_point(
    aes(x = review_scores_rating, y = price, color = rating_category),
    size = 3
  ) +
  labs(
    title = "Review Scores Rating vs Price",
    x = "Review Score",
    y = "Price",
    color = "Rating Category (>80 = High, ≤80 = Low)"
  ) +
  theme_minimal()
```
# Question 3.1: Do most shared rooms have a flexible cancellation policy? (Pie Chart)
```
# Count the shared-room listings under each cancellation policy and store the
# counts as a two-column data frame (policy, count) for plotting.
shared_rooms <- df %>% filter(room_type == "Shared room")
cancellation_summary <- table(shared_rooms$cancellation_policy)
cancellation_df <- as.data.frame(cancellation_summary)
names(cancellation_df) <- c("Cancellation_Policy", "Count")
```
# Create a pie chart
```
# Pie chart: a single stacked bar of the policy counts, drawn in polar
# coordinates so that each policy's count becomes a slice angle.
ggplot(cancellation_df) +
  geom_col(aes(x = "", y = Count, fill = Cancellation_Policy), width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(
    title = "Cancellation Policy for Shared Rooms",
    fill = "Cancellation Policy"
  ) +
  theme_void()
```
# Question 3.2: Are private rooms significantly more expensive than shared rooms? (Column Chart)
```
# Column chart of the average price of private versus shared rooms, drawn
# from the private_shared_summary table computed for Question 1.1.
ggplot(private_shared_summary) +
  geom_col(aes(x = room_type, y = avg_price, fill = room_type)) +
  ggtitle("Average Price: Private vs Shared Rooms") +
  xlab("Room Type") +
  ylab("Average Price") +
  theme(legend.position = "none")
```
# Question 3.3: Is there any relationship between accommodates and cancellation policy? (Boxplot)
```
# Boxplot of no_of_accommodates for each cancellation policy, one box per
# policy. The legend is switched off.
ggplot(df) +
  geom_boxplot(
    aes(
      x = cancellation_policy,
      y = no_of_accommodates,
      fill = cancellation_policy
    )
  ) +
  ggtitle("Number of Accommodates by Cancellation Policy") +
  xlab("Cancellation Policy") +
  ylab("Number of Accommodates") +
  theme(legend.position = "none")
```
# Question 3.4: Do most instantly bookable rooms have a flexible cancellation policy? (Pie Chart)
```
# Count the listings with instant_bookable == "t" under each cancellation
# policy and store the counts as a two-column data frame for plotting.
instant_rooms <- df %>% filter(instant_bookable == "t")
instant_summary <- table(instant_rooms$cancellation_policy)
instant_df <- as.data.frame(instant_summary)
names(instant_df) <- c("Cancellation_Policy", "Count")
```
# Create a pie chart
```
# Pie chart: a single stacked bar of the policy counts, drawn in polar
# coordinates so that each policy's count becomes a slice angle.
ggplot(instant_df) +
  geom_col(aes(x = "", y = Count, fill = Cancellation_Policy), width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(
    title = "Proportion of Cancellation Policy for Instantly Bookable Rooms",
    fill = "Cancellation Policy"
  ) +
  theme_void()
```
# Question 3.5: Does price increase for shared rooms as accommodates increase? (Line Plot)
```
# Restrict to shared rooms and compute the mean price for each distinct value
# of no_of_accommodates.
# The closing parenthesis of summarise() was missing, which left the call
# open and made the script unparseable from this point on; it is restored.
shared_rooms_accommodates <- filter(df, room_type == "Shared room")
shared_accommodates_summary <- summarise(
  group_by(shared_rooms_accommodates, no_of_accommodates),
  avg_price = mean(price)
)

# Line plot named in the question heading, which was absent: average price of
# shared rooms against no_of_accommodates, with a marker on every value.
# `linewidth` (not `size`) sets line thickness in ggplot2 >= 3.4.0.
ggplot(
  shared_accommodates_summary,
  aes(x = no_of_accommodates, y = avg_price)
) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Average Price of Shared Rooms by Number of Accommodates",
    x = "Number of Accommodates",
    y = "Average Price"
  ) +
  theme_minimal()
```
# How many rows and columns?
```
# Report the shape of the data: dim() gives the row count, then the column
# count.
print("Number of Rows and Columns:")
print(dim(df))
```
# What are the columns and their types?
```
# Show the compact structure of the data as printed by str().
print("Column Names and Types:")
df %>% str()
```
# Are there any empty spots in the data?
```
# Count the NA entries in every column: is.na() marks each cell and colSums()
# adds the marks up per column.
print("Number of Missing Values in Each Column:")
df %>%
  is.na() %>%
  colSums()
```
# Create new features for deeper insights
# Price per accommodate
```
# Derive price_per_accommodate as price divided by no_of_accommodates, then
# print its summary statistics.
# NOTE(review): a no_of_accommodates of 0 would produce Inf here; confirm the
# data never contains it.
df <- df %>%
  mutate(price_per_accommodate = price / no_of_accommodates)
print("Summary of Price Per Accommodate:")
summary(df$price_per_accommodate)
```
# High rating indicator (rating > 90)
```
# Add a numeric high_rating flag: 1 when review_scores_rating is above 90,
# 0 otherwise. Then tabulate how many listings fall under each flag value.
df <- df %>%
  mutate(high_rating = as.numeric(review_scores_rating > 90))
print("Count of High-Rated Listings (Rating > 90):")
table(df$high_rating)
```
# Compare average price for different room types
```
# Mean price per room type, computed with aggregate() over all listings,
# then printed under a heading.
room_type_avg <- aggregate(price ~ room_type, data = df, FUN = mean)
print("Average Price by Room Type:")
print(room_type_avg)
```
# Compare average price for cancellation policies
```
# Mean price per cancellation policy, computed with aggregate() over all
# listings, then printed under a heading.
policy_avg <- aggregate(price ~ cancellation_policy, data = df, FUN = mean)
print("Average Price by Cancellation Policy:")
print(policy_avg)
```
# Compare average price for instant bookable
```
# Mean price per instant_bookable value, computed with aggregate() over all
# listings, then printed under a heading.
instant_avg <- aggregate(price ~ instant_bookable, data = df, FUN = mean)
print("Average Price by Instant Bookable:")
print(instant_avg)
```
# Correlation Analysis
```
# Pearson correlation test between price and no_of_accommodates.
# The df$... expressions are passed as written because cor.test() echoes
# them as the data name in its printed result.
correlation <- cor.test(
  x = df$price,
  y = df$no_of_accommodates,
  method = "pearson"
)
print(correlation)
```