ENV
===
set hostnames
---
```
# vim /etc/hosts
10.18.30.43 atw-lab-pc1-vm1
10.18.30.42 atw-lab-pc1-vm2
```
config ssh
---
*ssh-keygen*
*ssh-copy-id -i ~/.ssh/id_rsa.pub hdoop@atw-lab-pc1-vm1* (repeat for every node so each host can SSH to the others without a password)
(open the required ports or disable the firewall between the nodes)
---
JAVA
===
java-8-openjdk-amd64
install
---
```
sudo apt install openjdk-8-jdk
```
set JAVA home path
---
```
# ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
```
HADOOP
===
Hadoop 3.2.2
set hadoop-env.sh
---
```
# /usr/local/hadoop/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native"
export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
```
set hdfs-site.xml
---
*mkdir -p -m 777 /usr/local/hadoop/data/nameNode*
*mkdir -p -m 777 /usr/local/hadoop/data/dataNode*
```
# /usr/local/hadoop/etc/hadoop/hdfs-site.xml
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/local/hadoop/data/nameNode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/usr/local/hadoop/data/dataNode</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/usr/local/hadoop/etc/hadoop/excludes</value>
</property>
</configuration>
```
set core-site.xml
---
*mkdir -m 777 /home/hdoop/tmpdata*
```
# /usr/local/hadoop/etc/hadoop/core-site.xml
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>/home/hdoop/tmpdata</value>
</property>
<property>
<name>fs.defaultFS</name>
<value>hdfs://atw-lab-pc1-vm1:9000</value>
</property>
</configuration>
```
set workers
---
```
# /usr/local/hadoop/etc/hadoop/workers
atw-lab-pc1-vm1
atw-lab-pc1-vm2
```
set yarn-site.xml
---
```
# /usr/local/hadoop/etc/hadoop/yarn-site.xml
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>atw-lab-pc1-vm1</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
</configuration>
```
set Hadoop env path
---
```
# ~/.bashrc
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
```
format and start
---
*source ~/.bashrc*
*hdfs namenode -format*
*start-all.sh*
SPARK
===
spark 3.1.1
scala 2.12.13
Download Scala and Spark
---
Spark 3.1.1 is pre-built for Hadoop 3.2+
Spark 3.0+ is pre-built with Scala 2.12
```
cd /usr/local/spark_source
# scala
wget https://downloads.lightbend.com/scala/2.12.13/scala-2.12.13.tgz
# spark
wget https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
```
install Scala (ALL NODES)
---
uncompress
```
# scala
tar -zxvf scala-2.12.13.tgz
# spark
tar -zxvf spark-3.1.1-bin-hadoop3.2.tgz
```
create a soft link
```
sudo ln -f -s /usr/local/spark_source/scala-2.12.13 /usr/lib/scala
```
Install Spark (ALL NODES)
---
move spark folder to /usr/local/spark
```
sudo mv /usr/local/spark_source/spark-3.1.1-bin-hadoop3.2 /usr/local/spark
```
set config
```
cd /usr/local/spark/conf/
mv spark-env.sh.template spark-env.sh
mv workers.template workers
mv spark-defaults.conf.template spark-defaults.conf
```
```
# vim spark-env.sh
export SCALA_HOME=/usr/lib/scala
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_HOST=atw-lab-pc1-vm1
export SPARK_WORKER_MEMORY=32G
export SPARK_DAEMON_MEMORY=4G
```
```
# vim workers
localhost (delete this line)
atw-lab-pc1-vm1
atw-lab-pc1-vm2
```
```
# vim spark-defaults.conf
spark.master spark://atw-lab-pc1-vm1:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://atw-lab-pc1-vm1:9000/user/hdoop/log
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 4g
spark.executor.memory 6g
spark.executor.cores 2
spark.dynamicAllocation.maxExecutors 2
spark.ui.enabled true
```
Set Spark env path
---
```
# vim ~/.bashrc
export SCALA_HOME=/usr/lib/scala
export PATH=$PATH:$SCALA_HOME/bin
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
# source ~/.bashrc
```
Start Spark
---
*cd /usr/local/spark*
*./sbin/start-all.sh*
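To sanity-check the cluster after `start-all.sh`, a small PySpark job can be pointed at the standalone master from `spark-defaults.conf`. This is only a minimal sketch: it assumes `pyspark` is installed (see the Python section below) and that the HDFS event-log directory from `spark-defaults.conf` already exists.
```
# smoke_test.py - minimal check that the standalone master accepts applications
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("spark://atw-lab-pc1-vm1:7077")
         .appName("cluster-smoke-test")
         .getOrCreate())

# distribute a trivial computation across the workers
total = spark.sparkContext.parallelize(range(1000), numSlices=4).sum()
print(total)  # expected: 499500

spark.stop()
```
The standalone master web UI (port 8080 by default) should also list both workers.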
HIVE
===
install hive
---
*tar -zxvf apache-hive-3.1.2-bin.tar.gz*
*sudo mv apache-hive-3.1.2-bin /usr/local/hive*
Setup environment variables
---
```
#vim ~/.bashrc
export HIVE_HOME=/usr/local/hive
export PATH=$HIVE_HOME/bin:$PATH
export HIVE_CONF_DIR=/usr/local/hive/conf
#source ~/.bashrc
```
Setup Hive HDFS folders
---
```
hadoop fs -mkdir /tmp
hadoop fs -mkdir -p /user/hive/warehouse
hadoop fs -chmod g+w /tmp
hadoop fs -chmod g+w /user/hive/warehouse
hdfs dfs -mkdir -p /user/hdoop/log
```
install MySQL (master only)
---
```
sudo apt update
sudo apt install mysql-server (-y)
sudo mysql
mysql> CREATE USER 'hive'@'localhost' IDENTIFIED BY 'hive' password expire never;
mysql> GRANT ALL ON *.* TO 'hive'@'localhost';
mysql> exit;
mysql -u hive -phive
mysql> create database hive_metastore;
# Query OK, 1 row affected (0.00 sec)
mysql> show databases;
# After these steps, the following database objects are created:
# user: hive
# password: hive
# database: hive_metastore
```
Download MySQL JDBC driver
---
```
wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.22/mysql-connector-java-8.0.22.jar
mv mysql-connector-java-8.0.22.jar /usr/local/hive/lib/
cp /usr/local/hive/conf/hive-default.xml.template /usr/local/hive/conf/hive-site.xml
```
guava lib issue
---
```
rm $HIVE_HOME/lib/guava-19.0.jar
cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/
```
config and run hive
---
```
# vim /usr/local/hive/conf/hive-site.xml
javax.jdo.option.ConnectionDriverName: com.mysql.cj.jdbc.Driver
javax.jdo.option.ConnectionURL: jdbc:mysql://localhost/hive_metastore
javax.jdo.option.ConnectionUserName: hive
javax.jdo.option.ConnectionPassword: hive
hive.metastore.uris: thrift://127.0.0.1:9083
hive.metastore.db.type: mysql
<property>
<name>system:java.io.tmpdir</name>
<value>/tmp/hive/java</value>
</property>
<property>
<name>system:user.name</name>
<value>${user.name}</value>
</property>
```
```
schematool -dbType mysql -initSchema
/usr/local/hive/bin/hive --service metastore &
```
if the NameNode location changes
---
```
hive --service metatool -updateLocation hdfs://atw-lab-pc1-vm3:9000 hdfs://atw-lab-pc1-vm1:9000
```
Docker
===
```
sudo apt-get update
sudo apt-get install \
apt-transport-https \
ca-certificates \
curl \
gnupg \
lsb-release
curl -x "http://user:password@10.18.131.1:8080" -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo \
"deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install docker-ce docker-ce-cli containerd.io
sudo systemctl start docker
# set proxy (create the drop-in directory first: sudo mkdir -p /etc/systemd/system/docker.service.d)
sudo vim /etc/systemd/system/docker.service.d/http-proxy.conf
[Service]
Environment="HTTP_PROXY=http://labpc18:hmy4q6s01KyqR6L7hWh3@10.18.131.1:8080"
Environment="HTTPS_PROXY=http://labpc18:hmy4q6s01KyqR6L7hWh3@10.18.131.1:8080"
sudo systemctl daemon-reload
sudo systemctl restart docker
```
KAFKA
===
```
# vim docker-compose.yaml
environment:
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,CONNECTIONS_FROM_HOST://atw-lab-pc1-vm3:19092
```
```
docker-compose up -d
docker exec -it poc_test_kafka_1 bash
/opt/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --create --replication-factor 1 --partitions 4 --topic poc
./src/sendRMQSrcConfig.sh localhost:8083
```
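With the `poc` topic created, messages can be exchanged from outside the containers through the CONNECTIONS_FROM_HOST listener. A minimal sketch using kafka-python; the payload is illustrative:
```
# kafka_check.py - produce and consume a test message on the 'poc' topic
# (assumes `pip3 install kafka-python` and the listener advertised above)
from kafka import KafkaProducer, KafkaConsumer

BOOTSTRAP = "atw-lab-pc1-vm3:19092"  # CONNECTIONS_FROM_HOST listener from docker-compose.yaml

producer = KafkaProducer(bootstrap_servers=BOOTSTRAP)
producer.send("poc", value=b'{"message_type": "qa", "payload": "test"}')
producer.flush()

consumer = KafkaConsumer(
    "poc",
    bootstrap_servers=BOOTSTRAP,
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,  # stop iterating after 5 s without new messages
)
for record in consumer:
    print(record.partition, record.offset, record.value)
```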
Python
===
kafka-python
hdfs
pyspark
~~watchdog~~
~~pika~~
uvicorn
fastapi
```
sudo apt install python3-pip
pip3 install pyspark --proxy=http://labpc18:hmy4q6s01KyqR6L7hWh3@10.18.131.1:8080
# kafka-python, hdfs, fastapi and uvicorn install the same way
sudo apt install docker-compose
vim consumer.py
vim consumer_hive.py
```
```
from hdfs import InsecureClient

hdfsclient = InsecureClient('http://atw-lab-pc1-vm1:9870', user='hdoop')
```
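consumer.py then only has to pull records from Kafka and append them to HDFS through this client. A rough sketch of that loop; the topic, listener, paths and payload format are illustrative assumptions:
```
# consumer.py (sketch): drain the 'poc' topic into an HDFS file via WebHDFS
from hdfs import InsecureClient
from kafka import KafkaConsumer

hdfsclient = InsecureClient('http://atw-lab-pc1-vm1:9870', user='hdoop')
consumer = KafkaConsumer('poc', bootstrap_servers='atw-lab-pc1-vm3:19092',
                         auto_offset_reset='earliest')

hdfsclient.makedirs('/user/hdoop/raw')            # no-op if it already exists
target = '/user/hdoop/raw/poc_messages.jsonl'

for record in consumer:
    line = record.value.decode('utf-8') + '\n'
    # first write creates the file, later writes append to it
    if hdfsclient.status(target, strict=False) is None:
        hdfsclient.write(target, data=line, encoding='utf-8')
    else:
        hdfsclient.write(target, data=line, encoding='utf-8', append=True)
```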
DB
---
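All of the `spark.sql(...)` snippets below assume a PySpark session with Hive support enabled, so the tables land in the MySQL-backed metastore set up above. A minimal sketch; the metastore is assumed reachable on the master node and the warehouse path is the HDFS folder created in the Hive section:
```
# session used for the spark.sql(...) calls in this section
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("poc-db-setup")
         .config("hive.metastore.uris", "thrift://atw-lab-pc1-vm1:9083")
         .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
         .enableHiveSupport()
         .getOrCreate())
```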
Metadata
```
spark.sql("CREATE TABLE message_type (message_type STRING)")
spark.sql("INSERT INTO message_type VALUES ('motion'), ('qa'), ('predict')")
```

```
# spark.sql("CREATE TABLE motion (equipment_model STRING, module STRING, column_num INT, definition STRING)")
spark.sql("CREATE TABLE motion_meta (version INT, equip_model STRING, equip_module STRING, definition STRING)")
#spark.sql("INSERT OVERWRITE motion VALUES ('wb', 'bs', 292, 'feature6 int,feature7 string,feature8 int,feature9 double,feature10 double,feature11 double,feature12 double,feature13 double,feature14 double,feature15 double,feature16 double,feature17 double,feature18 double,feature19 double,feature20 double,feature21 double,feature22 double,feature23 double,feature24 double,feature25 double,feature26 double,feature27 int,feature28 double,feature29 int,feature30 int,feature31 int,feature32 int,feature33 int,feature34 int,feature35 int,feature36 int,feature37 int,feature38 int,feature39 int,feature40 string,feature41 double,feature42 double,feature43 int,feature44 int,feature45 int,feature46 int,feature47 double,feature48 int,feature49 int,feature50 int,feature51 int,feature52 int,feature53 int,feature54 int,feature55 int,feature56 int,feature57 int,feature58 int,feature59 int,feature60 int,feature61 int,feature62 int,feature63 int,feature64 int,feature65 int,feature66 int,feature67 int,feature68 int,feature69 int,feature70 int,feature71 int,feature72 int,feature73 int,feature74 int,feature75 int,feature76 int,feature77 int,feature78 int,feature79 int,feature80 int,feature81 int,feature82 int,feature83 int,feature84 int,feature85 int,feature86 int,feature87 int,feature88 double,feature89 double,feature90 double,feature91 double,feature92 double,feature93 double,feature94 double,feature95 double,feature96 double,feature97 double,feature98 int,feature99 int,feature100 int,feature101 int,feature102 int,feature103 int,feature104 int,feature105 int,feature106 int,feature107 int,feature108 int,feature109 int,feature110 int,feature111 int,feature112 int,feature113 double,feature114 double,feature115 int,feature116 int,feature117 int,feature118 int,feature119 int,feature120 int,feature121 int,feature122 int,feature123 int,feature124 int,feature125 double,feature126 double,feature127 double,feature128 double,feature129 double,feature130 double,feature131 double,feature132 double,feature133 double,feature134 double,feature135 int,feature136 int,feature137 int,feature138 int,feature139 int,feature140 int,feature141 int,feature142 int,feature143 int,feature144 int,feature145 int,feature146 int,feature147 int,feature148 int,feature149 int,feature150 double,feature151 double,feature152 int,feature153 int,feature154 int,feature155 int,feature156 int,feature157 int,feature158 int,feature159 int,feature160 int,feature161 int,feature162 double,feature163 double,feature164 double,feature165 double,feature166 double,feature167 double,feature168 double,feature169 double,feature170 double,feature171 double,feature172 int,feature173 int,feature174 int,feature175 int,feature176 int,feature177 int,feature178 int,feature179 int,feature180 int,feature181 int,feature182 int,feature183 int,feature184 int,feature185 int,feature186 int,feature187 double,feature188 double,feature189 int,feature190 int,feature191 int,feature192 int,feature193 int,feature194 int,feature195 int,feature196 int,feature197 int,feature198 int,feature199 double,feature200 double,feature201 double,feature202 double,feature203 double,feature204 double,feature205 double,feature206 double,feature207 double,feature208 double,feature209 int,feature210 int,feature211 int,feature212 int,feature213 int,feature214 int,feature215 int,feature216 int,feature217 int,feature218 int,feature219 int,feature220 int,feature221 int,feature222 int,feature223 int,feature224 double,feature225 double,feature226 int,feature227 int,feature228 int,feature229 int,feature230 
int,feature231 int,feature232 int,feature233 int,feature234 int,feature235 int,feature236 double,feature237 double,feature238 double,feature239 double,feature240 double,feature241 double,feature242 double,feature243 double,feature244 double,feature245 double,feature246 int,feature247 int,feature248 int,feature249 int,feature250 int,feature251 int,feature252 int,feature253 int,feature254 int,feature255 int,feature256 int,feature257 int,feature258 int,feature259 int,feature260 int,feature261 double,feature262 double,feature263 int,feature264 int,feature265 int,feature266 int,feature267 int,feature268 int,feature269 int,feature270 int,feature271 int,feature272 double,feature273 double,feature274 int,feature275 int,feature276 int,feature277 double,feature278 double,feature279 int,feature280 int,feature281 int,feature282 double,feature283 double,feature284 double,feature285 double,feature286 double,feature287 double,feature288 double,feature289 double')")
spark.sql("INSERT OVERWRITE motion_meta VALUES (0, 'testing', 'testing','{'grp_1':{'feat1': 'int'}}')")
spark.sql("select * from motion").show()
```

```
spark.sql("CREATE TABLE IF NOT EXISTS row_strip_detail (strip_detail_uid STRING, lf_uid STRING, unit_no INT, row_no INT, col_no INT, group_no INT, die_no INT, wire_no INT)")
spark.sql("CREATE TABLE IF NOT EXISTS measure_item (ms_item_uid int, measure_item string, type string)")
spark.sql("CREATE TABLE IF NOT EXISTS leadframe (lf_uid string, lf_id string, im_uid string)")
spark.sql("CREATE TABLE IF NOT EXISTS input_magazine (im_uid string, input_mag_no int, l_uid string)")
spark.sql("CREATE TABLE IF NOT EXISTS lot (l_uid string, lot_id string, rp_uid string, mc_uid string)")
spark.sql("CREATE TABLE IF NOT EXISTS machine (mc_uid string, machine_id string)")
spark.sql("CREATE TABLE IF NOT EXISTS recipe (rp_uid string, recipe_name string)")
```
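These dimension tables key into each other through the *_uid columns (lot → machine and recipe, leadframe → input_magazine → lot), so typical lookups join them back together. An illustrative query, assuming the tables have been populated:
```
# resolve each lot to its machine and recipe
spark.sql("""
    SELECT l.lot_id, m.machine_id, r.recipe_name
    FROM lot l
    JOIN machine m ON l.mc_uid = m.mc_uid
    JOIN recipe  r ON l.rp_uid = r.rp_uid
""").show()
```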
```
#delete
spark.sql("drop table bs_motion")
spark.sql("drop table row_strip_detail ")
spark.sql("drop table measure_item")
spark.sql("drop table leadframe ")
spark.sql("drop table input_magazine ")
spark.sql("drop table lot ")
spark.sql("drop table machine ")
spark.sql("drop table recipe ")
```
```
spark.sql("INSERT INTO measure_item VALUES (0,'ball_shear','wire'),(1,'wire_pull','wire'),(2,'neck_pull','wire'),(3,'stitch_pull','wire'),(4,'inner_ball_size_x','wire'),(5,'inner_ball_size_y','wire'),(6,'placement','die'),(7,'tilt','die')")
```
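measure_item then acts as a lookup from ms_item_uid to the measurement name and its level (wire vs die), for example:
```
# list the wire-level measurement items seeded above
spark.sql("SELECT ms_item_uid, measure_item FROM measure_item WHERE type = 'wire'").show()
```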
```
spark.sql("CREATE TABLE IF NOT EXISTS qa_item (
STRING, ms_item_uid STRING, strip_detail_uid STRING, rp_uid STRING, l_uid STRING, mc_uid STRING, value INT, result STRING, spec_lcl INT, spec_ucl INT, spec_lsl INT, spec_usl INT) STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS inner_ball_size_x (date STRING, strip_detail_uid STRING, rp_uid STRING, l_uid STRING, mc_uid STRING, value INT, result STRING, spec_lcl INT, spec_ucl INT, spec_lsl INT, spec_usl INT) STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS inner_ball_size_y (date STRING, strip_detail_uid STRING, rp_uid STRING, l_uid STRING, mc_uid STRING, value INT, result STRING, spec_lcl INT, spec_ucl INT, spec_lsl INT, spec_usl INT) STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS ball_shear (date STRING, strip_detail_uid STRING, rp_uid STRING, l_uid STRING, mc_uid STRING, value INT, result STRING, spec_lcl INT, spec_ucl INT, spec_lsl INT, spec_usl INT) STORED AS ORC")
```
```
# delete
spark.sql("drop table inner_ball_size_x ")
spark.sql("drop table inner_ball_size_y ")
spark.sql("drop table ball_shear ")
```
```
spark.sql("CREATE TABLE IF NOT EXISTS ball_shear (value INT, result STRING, spec_lcl INT, spec_ucl INT, spec_lsl INT, spec_usl INT) PARTITIONED BY (date STRING, strip_detail_uid STRING, rp_uid STRING, l_uid STRING, mc_uid STRING) CLUSTERED BY (value, result) INTO 4 BUCKETS STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS qa_wire_based_stats (wire_no INT, pass_num INT, reject_num INT, min FLOAT, max FLOAT, bin_list ARRAY<ARRAY<FLOAT>>, bin_count ARRAY<FLOAT>) PARTITIONED BY (measure_item STRING, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) CLUSTERED BY (wire_no) INTO 100 BUCKETS STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS qa_unit_based_stats (unit_no INT, pass_num INT, reject_num INT, min FLOAT, max FLOAT, bin_list ARRAY<ARRAY<FLOAT>>, bin_count ARRAY<FLOAT>) PARTITIONED BY (measure_item STRING, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) CLUSTERED BY (unit_no) INTO 100 BUCKETS STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS predict_wire_based_stats (wire_no INT, pass_num INT, reject_num INT, min FLOAT, max FLOAT, bin_list ARRAY<ARRAY<FLOAT>>, bin_count ARRAY<FLOAT>) PARTITIONED BY (model_version INT, measure_item STRING, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) CLUSTERED BY (wire_no) INTO 100 BUCKETS STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS predict_unit_based_stats (unit_no INT, pass_num INT, reject_num INT, min FLOAT, max FLOAT, bin_list ARRAY<ARRAY<FLOAT>>, bin_count ARRAY<FLOAT>) PARTITIONED BY (model_version INT, measure_item STRING, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) CLUSTERED BY (unit_no) INTO 100 BUCKETS STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS anomaly_bb_stats (pass_num INT, reject_num INT) PARTITIONED BY (model_version INT, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) STORED AS ORC")
spark.sql("CREATE TABLE IF NOT EXISTS anomaly_bs_stats (pass_num INT, reject_num INT) PARTITIONED BY (model_version INT, date INT, hour INT, machine_id STRING, recipe_name STRING, lot_id STRING) STORED AS ORC")
```
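Writes into these tables need dynamic partitioning enabled, since the partition values come from the data rather than a static spec. A minimal sketch against the (non-bucketed) anomaly_bs_stats table; the values are illustrative:
```
# allow fully dynamic partition values on insert
spark.sql("SET hive.exec.dynamic.partition.mode=nonstrict")

# column order: regular columns first, then partition columns in declaration order
spark.sql("""
    INSERT INTO anomaly_bs_stats
    PARTITION (model_version, date, hour, machine_id, recipe_name, lot_id)
    VALUES (95, 5, 1, 20210601, 10, 'mc-001', 'recipe-a', 'lot-123')
""")

spark.sql("SHOW PARTITIONS anomaly_bs_stats").show(truncate=False)
```
Note that Spark does not produce Hive-compatible bucketed output, so inserts into the CLUSTERED BY tables above may be rejected or written without bucketing unless hive.enforce.bucketing/hive.enforce.sorting are relaxed.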
Migrate