https://drive.google.com/file/d/1QuDfDN7dIqb3op7uBF8E6i0dW_pfIYg2/view?usp=sharing
hadoop fs -mkdir -p /workshop/source_files/hive/
hadoop fs -copyFromLocal hive_emp_load.txt /workshop/source_files/hive/
## Local source file: D:\Personal\Big_Data_Webinar\VVCE\hive_datasets\hive_emp_load.txt
create table hive_emp_table(id INT, name string) row format delimited fields terminated by ',' stored as textfile;
load data inpath '/workshop/source_files/hive/hive_emp_load.txt' into table hive_emp_table;
SET hive.enforce.bucketing=true;
-- nonstrict mode is required for the fully dynamic partition insert used further below
SET hive.exec.dynamic.partition.mode=nonstrict;
hadoop fs -copyFromLocal orders_country_w_states.csv /workshop/source_files/hive/
create table orders_table_no_partition(order_id string, cust_id string, product_id string, order_amount float, zip int, country string,state string )
row format delimited fields terminated by ',' stored as textfile;
load data inpath '/workshop/source_files/hive/orders_country_w_states.csv' into table orders_table_no_partition;
create table orders_table_with_partition_only(order_id string, cust_id string, product_id string, order_amount float, zip int, country string )
PARTITIONED BY(state string)
row format delimited fields terminated by ',' stored as textfile;
insert into table orders_table_with_partition_only partition(state) select * from orders_table_no_partition;
hadoop fs -ls /user/hive/warehouse/orders_table_with_partition_only/
describe formatted orders_table_with_partition_only;
https://drive.google.com/file/d/1OH3jIn2a5Bd88ZN8IheMjfiQz63Nmw43/view?usp=sharing
####################### DAY 2 #######################################################
https://drive.google.com/file/d/1dnr-53WMAHpY7Ek6Y4w5rW263CGyj7j5/view?usp=sharing
#sample_code.py
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
.appName('spark_example') \
.getOrCreate()
print(spark)
df = spark.read.option('header',True).option("delimiter",",").csv(r"c://workshop/spark/sample_dataset/personal_transactions.csv")
df.show()
import pyspark.sql.functions as F
print(df.columns)
df2 = df.withColumn("Amount_Doubled",F.col("Amount")*2)
df2.show()
Open a terminal inside the folder containing personal_transactions.csv (sample_dataset) and append the file to itself to grow it:
type personal_transactions.csv >> personal_transactions.csv
print(df.rdd.getNumPartitions())
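The df created earlier keeps the partitioning it picked up when the file was first read, so to see the effect of the larger file, re-read it. A minimal sketch (the count only changes once the file is large enough; the default split size is 128 MB):
df_big = spark.read.option('header',True).option("delimiter",",").csv(r"c://workshop/spark/sample_dataset/personal_transactions.csv")
print(df_big.rdd.getNumPartitions())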
df2 = df.groupby("Category").count()
df2.show()
import sys
# pause here (wait for Enter) so the application stays alive and the Spark UI below remains reachable
sys.stdin.readline()
http://localhost:4040/  (Spark Web UI; available while the application is running)
from pyspark import StorageLevel
df.persist(StorageLevel.DISK_ONLY)
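persist() is lazy: nothing is cached until an action runs. A minimal sketch of materializing the cache and releasing it later:
df.count()        # an action materializes the persisted data (visible on the Storage tab of the Spark UI)
df.unpersist()    # release the cached data once it is no longer needed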
Exercise: Filter the records whose Category is 'Shopping' and whose card type is 'Silver Card', then write this df to your local storage under "c://workshop/spark_output/example_1/" (see the sketch below).
df2.repartition(1).write.parquet(r"c://workshop/spark_output/example_1/")
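A sketch of the full exercise step, assuming the card-type column in personal_transactions.csv is named "Card Type" (adjust to the actual header):
filtered_df = df.filter((F.col("Category") == "Shopping") & (F.col("Card Type") == "Silver Card"))  # "Card Type" column name is an assumption
filtered_df.repartition(1).write.mode("overwrite").parquet(r"c://workshop/spark_output/example_1/")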
### Final Mini-Project
Problem Statement: NBA Auction 2023 is about to happen, and the stakeholders want a data pipeline that helps them analyze statistics from prior match data. To facilitate their request, build a data pipeline using Spark for the following scenarios (a sketch follows the list):
1) Get the distinct TEAM_IDs that have recorded a Winning Percentage (W_PCT) of 1 at least once in their history. - 3000 Points
2) Fetch the GAME_ID and "Result" (computed as 'WON' if HOME_TEAM_WINS indicates a win, else 'LOST') of all the matches played by TEAM_CITY "Washington". - 7000 Points
Hint: a) Use both games.csv and game_details.csv  b) Use case statements
3) Fetch the Player_Name and Player_ID information of players who have played in more than one season. - 10000 Points
Hint: Use one of the window functions
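A minimal PySpark sketch for the three scenarios. The file paths, the source of W_PCT (assumed here to be a ranking.csv standings file), and some column names (SEASON, TEAM_CITY, HOME_TEAM_WINS, PLAYER_NAME, PLAYER_ID) are assumptions based on the common NBA games dataset layout; adjust them to the files shared for the workshop.
#nba_pipeline_sketch.py
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").appName('nba_pipeline').getOrCreate()

# File paths below are assumptions; point them at your local copies of the datasets.
games = spark.read.option('header', True).option('inferSchema', True).csv(r"c://workshop/spark/nba/games.csv")
details = spark.read.option('header', True).option('inferSchema', True).csv(r"c://workshop/spark/nba/game_details.csv")
ranking = spark.read.option('header', True).option('inferSchema', True).csv(r"c://workshop/spark/nba/ranking.csv")  # assumed source of W_PCT

# 1) Distinct TEAM_ID with a winning percentage of 1 recorded at least once
q1 = ranking.filter(F.col("W_PCT") == 1).select("TEAM_ID").distinct()

# 2) GAME_ID and "Result" for all matches played by TEAM_CITY "Washington"
#    (case-statement logic via when/otherwise, joining the two files on GAME_ID)
q2 = (details.filter(F.col("TEAM_CITY") == "Washington")
      .join(games, "GAME_ID")
      .select("GAME_ID",
              F.when(F.col("HOME_TEAM_WINS") == 1, "WON").otherwise("LOST").alias("Result"))
      .distinct())

# 3) Players who appear in more than one season, using a window function:
#    collect the distinct seasons per player over a window and keep players with more than one
w = Window.partitionBy("PLAYER_ID")
q3 = (details.join(games.select("GAME_ID", "SEASON"), "GAME_ID")
      .withColumn("season_count", F.size(F.collect_set("SEASON").over(w)))
      .filter(F.col("season_count") > 1)
      .select("PLAYER_NAME", "PLAYER_ID")
      .distinct())

q1.show()
q2.show()
q3.show()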