# Step 1: Set up packages and load the data

# Install only the packages that are missing, so re-running the script does
# not reinstall everything on each run.
required_packages <- c("ggplot2", "readxl", "dplyr", "tidyr")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(ggplot2) # For making nice charts
library(readxl)  # For reading Excel files
library(dplyr)   # For cleaning and summarizing data
library(tidyr)   # For organizing data (if needed)

# Load the listings; the relative path is resolved against the working
# directory.
df <- read_excel("Airbnbc.xlsx")

# Quick look at the data: first rows, printed table, per-column summary.
head(df, n = 5)
print(df)
summary(df)

# Step 2: Answer Each Question with Specified Chart Types

# Question 1.1: Do private rooms cost more than shared rooms? (Column Chart)
private_shared <- df %>%
  filter(room_type %in% c("Private room", "Shared room"))

# na.rm = TRUE keeps one missing price from turning a whole group's
# average into NA.
private_shared_summary <- private_shared %>%
  group_by(room_type) %>%
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    count_listings = n()
  )

# Create a column chart
ggplot(
  private_shared_summary,
  aes(x = room_type, y = avg_price, fill = room_type)
) +
  geom_col() +
  labs(
    title = "Average Price: Private vs Shared Rooms",
    x = "Room Type",
    y = "Average Price"
  ) +
  theme(legend.position = "none")

# Question 1.2: Does the number of accommodates affect the price of the
# rented space? (Line Plot)
accommodates_summary <- df %>%
  group_by(no_of_accommodates) %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Create a line plot
# `linewidth` sets the line thickness; `size` is deprecated for lines in
# current ggplot2 releases.
ggplot(accommodates_summary, aes(x = no_of_accommodates, y = avg_price)) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Average Price Trend by Number of Accommodates",
    x = "Number of Accommodates",
    y = "Average Price"
  ) +
  theme_minimal()

# Question 1.3: Does the number of bathrooms affect the price? (Bar Plot)
bathrooms_summary <- df %>%
  group_by(no_of_bathrooms) %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Create a bar plot
# geom_col() draws bars whose heights are the y values, which is what
# geom_bar(stat = "identity") does.
ggplot(bathrooms_summary, aes(x = no_of_bathrooms, y = avg_price)) +
  geom_col(fill = "blue") +
  labs(
    title = "Average Price by Number of Bathrooms",
    x = "Number of Bathrooms",
    y = "Average Price"
  ) +
  theme_minimal()

# Question 1.4: Does the number of bedrooms affect the price?
# (Bar Chart)
# Average price for each bedroom count; na.rm = TRUE keeps one missing
# price from turning a whole group's average into NA.
bedrooms_summary <- df %>%
  group_by(no_of_bedrooms) %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Create a bar chart
# geom_col() draws bars whose heights are the y values, which is what
# geom_bar(stat = "identity") does.
ggplot(bedrooms_summary, aes(x = no_of_bedrooms, y = avg_price)) +
  geom_col(fill = "green") +
  labs(
    title = "Average Price by Number of Bedrooms",
    x = "Number of Bedrooms",
    y = "Average Price"
  ) +
  theme_minimal()

# Question 2.1: Do rooms with a flexible cancellation policy cost more than
# strict policy? (Column Chart)
flex_strict <- df %>%
  filter(cancellation_policy %in% c("flexible", "strict"))

flex_strict_summary <- flex_strict %>%
  group_by(cancellation_policy) %>%
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    count_listings = n()
  )

# Create a column chart
ggplot(
  flex_strict_summary,
  aes(x = cancellation_policy, y = avg_price, fill = cancellation_policy)
) +
  geom_col() +
  labs(
    title = "Average Price by Cancellation Policy",
    x = "Cancellation Policy",
    y = "Average Price"
  ) +
  theme(legend.position = "none")

# Question 2.2: Does instant booking of available rooms cost more?
# (Column Chart)
# Only rows whose flag is exactly "t" or "f" are kept.
instant_booking <- df %>%
  filter(instant_bookable %in% c("t", "f"))

instant_booking_summary <- instant_booking %>%
  group_by(instant_bookable) %>%
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    count_listings = n()
  )

# Create a column chart
ggplot(
  instant_booking_summary,
  aes(x = instant_bookable, y = avg_price, fill = instant_bookable)
) +
  geom_col() +
  labs(
    title = "Average Price by Instant Bookable",
    x = "Instant Bookable (t = Yes, f = No)",
    y = "Average Price"
  ) +
  theme(legend.position = "none")

# Question 2.3: Do rooms with cleaning fees included cost more?
# (Column Chart)
# Average price and listing count for each cleaning_fee value;
# na.rm = TRUE keeps one missing price from turning a whole group's
# average into NA.
cleaning_fee_summary <- df %>%
  group_by(cleaning_fee) %>%
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    count_listings = n()
  )

# Create a column chart
ggplot(
  cleaning_fee_summary,
  aes(x = cleaning_fee, y = avg_price, fill = cleaning_fee)
) +
  geom_col() +
  labs(
    title = "Average Price by Cleaning Fee",
    x = "Cleaning Fee Included",
    y = "Average Price"
  ) +
  theme(legend.position = "none")

# Question 2.4: Do rooms with high ratings cost more? (Scatter Plot)
# Scores above 80 are labelled "High", everything else "Low"; a missing
# score stays NA.
df$rating_category <- ifelse(df$review_scores_rating > 80, "High", "Low")

ggplot(
  df,
  aes(x = review_scores_rating, y = price, color = rating_category)
) +
  geom_point(size = 3) +
  labs(
    title = "Review Scores Rating vs Price",
    x = "Review Score",
    y = "Price",
    color = "Rating Category (>80 = High, ≤80 = Low)"
  ) +
  theme_minimal()

# Question 3.1: Do most shared rooms have flexible cancellation policy?
# (Pie Chart)
shared_rooms <- df %>%
  filter(room_type == "Shared room")

# Count listings per cancellation policy and turn the table into a
# two-column data frame for plotting.
cancellation_summary <- table(shared_rooms$cancellation_policy)
cancellation_df <- setNames(
  as.data.frame(cancellation_summary),
  c("Cancellation_Policy", "Count")
)

# Create a pie chart
# A single stacked bar mapped to polar coordinates on y becomes a pie.
ggplot(
  cancellation_df,
  aes(x = "", y = Count, fill = Cancellation_Policy)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(
    title = "Cancellation Policy for Shared Rooms",
    fill = "Cancellation Policy"
  ) +
  theme_void()

# Question 3.2: Are private rooms significantly more expensive than shared
# rooms? (Column Chart)
# Reuses private_shared_summary, which is computed for Question 1.1.
ggplot(
  private_shared_summary,
  aes(x = room_type, y = avg_price, fill = room_type)
) +
  geom_col() +
  labs(
    title = "Average Price: Private vs Shared Rooms",
    x = "Room Type",
    y = "Average Price"
  ) +
  theme(legend.position = "none")

# Question 3.3: Is there any relationship between accommodates and
# cancellation policy?
# (Boxplot)
# Distribution of no_of_accommodates within each cancellation policy.
ggplot(
  df,
  aes(
    x = cancellation_policy,
    y = no_of_accommodates,
    fill = cancellation_policy
  )
) +
  geom_boxplot() +
  labs(
    title = "Number of Accommodates by Cancellation Policy",
    x = "Cancellation Policy",
    y = "Number of Accommodates"
  ) +
  theme(legend.position = "none")

# Question 3.4: Are most instantly bookable rooms flexible? (Pie Chart)
instant_rooms <- df %>%
  filter(instant_bookable == "t")

# Count listings per cancellation policy and turn the table into a
# two-column data frame for plotting.
instant_summary <- table(instant_rooms$cancellation_policy)
instant_df <- setNames(
  as.data.frame(instant_summary),
  c("Cancellation_Policy", "Count")
)

# Create a pie chart
# A single stacked bar mapped to polar coordinates on y becomes a pie.
ggplot(instant_df, aes(x = "", y = Count, fill = Cancellation_Policy)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(
    title = "Proportion of Cancellation Policy for Instantly Bookable Rooms",
    fill = "Cancellation Policy"
  ) +
  theme_void()

# Question 3.5: Does price increase for shared rooms as accommodates
# increase? (Line Plot)
shared_rooms_accommodates <- df %>%
  filter(room_type == "Shared room")

# The summarise() call was left unclosed in the original script; it is
# completed here. na.rm = TRUE keeps one missing price from turning a whole
# group's average into NA.
shared_accommodates_summary <- shared_rooms_accommodates %>%
  group_by(no_of_accommodates) %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Create a line plot
# NOTE(review): the original script stopped before drawing this plot; the
# chart below mirrors the styling of the Question 1.2 line plot. Confirm
# the title and axis labels.
ggplot(
  shared_accommodates_summary,
  aes(x = no_of_accommodates, y = avg_price)
) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Average Price of Shared Rooms by Number of Accommodates",
    x = "Number of Accommodates",
    y = "Average Price"
  ) +
  theme_minimal()

# How many rows and columns?
print("Number of Rows and Columns:")
dim(df)

# What are the columns and their types?
print("Column Names and Types:")
str(df)

# Are there any empty spots in the data?
# Count the missing values in each column.
print("Number of Missing Values in Each Column:")
colSums(is.na(df))

# Create new features for deeper insights

# Price per accommodate
# Element-wise division; a row with zero accommodates yields Inf or NaN.
df$price_per_accommodate <- df$price / df$no_of_accommodates
print("Summary of Price Per Accommodate:")
summary(df$price_per_accommodate)

# High rating indicator (rating > 90)
# 1 when the score is above 90, 0 otherwise; a missing score stays NA.
df$high_rating <- ifelse(df$review_scores_rating > 90, 1, 0)
print("Count of High-Rated Listings (Rating > 90):")
table(df$high_rating)

# Compare average price for different room types
print("Average Price by Room Type:")
room_type_avg <- aggregate(price ~ room_type, data = df, FUN = mean)
print(room_type_avg)

# Compare average price for cancellation policies
print("Average Price by Cancellation Policy:")
policy_avg <- aggregate(
  price ~ cancellation_policy,
  data = df,
  FUN = mean
)
print(policy_avg)

# Compare average price for instant bookable
print("Average Price by Instant Bookable:")
instant_avg <- aggregate(price ~ instant_bookable, data = df, FUN = mean)
print(instant_avg)

# Correlation Analysis
# Pearson correlation test between price and number of accommodates.
correlation <- cor.test(
  df$price,
  df$no_of_accommodates,
  method = "pearson"
)
print(correlation)