# 分析近期共機數量
#### 組員: 14 26 32
---
## Github專案
[tnfsh-cs-grade-12-project](https://github.com/tobiichi3227/tnfsh-cs-grade-12-project)
---
## 摘要
透過Python爬取空軍空情動態並繪製圖表分析數量
---
## 研究動機
我家住機場旁,飛機變多了,有點吵
想了解近期共機數量以及是什麼事件導致的
---
## 解放軍軍機與軍艦統計圖表

---
## 解放軍氣球與火箭發射統計圖表

---
## 各項時間對應事件分析
- 02/01 中方宣佈取消M503航線自北向南飛行的偏置措施
- 02/14~03/01 海巡金門撞人
- 03/17~03/23 金門釣客落水
- 03/28 金馬水域進行實彈射擊
- 04/03 地震
---
## M503航線

---
## 程式架構

----
### 爬蟲
#### 解析網頁
```python=
# The air force site's markup is inconsistent from page to page, which is why
# several different selectors have to be tried for the same image.
def get_air_activity(url) -> "tuple[str | None, str | None]":
    """Fetch one news page and locate its activity-report image.

    Args:
        url: Address of the news page to download.

    Returns:
        ``(date, image_url)`` when an ``<img>`` is found under one of the
        known layouts. ``date`` is the first run of digits in the image's
        ``alt`` text (``''`` when it has none) and ``image_url`` is
        ``AIRFORCE_URL`` joined with the image ``src`` minus its first eight
        characters.
        ``(None, None)`` when no layout matches; in that case any counts
        written in the page text (digits before "架次" / "艘次") are printed.
    """
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')

    # Known locations of the report image, tried in the original order.
    selectors = (
        'span#ContentPlaceHolder1_lab_Descr > p > img',
        'span#ContentPlaceHolder1_lab_Descr > div > div > p > img',
        'span#ContentPlaceHolder1_lab_Descr > div > p > img',
        'span#ContentPlaceHolder1_lab_Descr > div > div > img',
    )
    img = None
    for selector in selectors:
        img = soup.select_one(selector)
        if img is not None:
            break

    if img is None:
        # No image on this page: report whatever counts the text itself
        # contains. `\d+` (not `\d*`) so that a bare "架次" without a number
        # does not print an empty line.
        plain = re.search(r"(\d+)架次", html)
        if plain is not None:
            print(plain.group(1))
        ship = re.search(r"(\d+)艘次", html)
        if ship is not None:
            print(ship.group(1))
        return None, None

    # `[\d]*` would match the empty string at position 0 and so yield ''
    # whenever the alt text does not start with a digit; `\d+` finds the
    # first real run of digits wherever it is.
    date_match = re.search(r"\d+", img.get('alt') or '')
    date = date_match.group() if date_match is not None else ''

    img_url = img['src']
    # The first 8 characters of src are dropped before joining --
    # presumably a relative-path prefix; TODO confirm against the site.
    return date, AIRFORCE_URL + img_url[8:]
```
----
#### ASPX爬蟲核心
```python=
# ASPX pages set values such as __VIEWSTATE anew on every load, and a query
# has to send them back, so they are pulled out of the page for the next query.
def get_aspx_hidden_value(url):
    """Download ``url`` and extract its ASP.NET hidden form values.

    Args:
        url: Address of the ASPX page to download.

    Returns:
        A 3-tuple ``(viewstate, eventvalidation, viewstategenerator)`` holding
        the ``value`` attribute of the first matching hidden ``<input>`` for
        each field.

    Raises:
        IndexError: If any of the three hidden inputs is absent from the
            page. The message names the missing field.
    """
    html = requests.get(url, timeout=30).text

    found_values = []
    for field in ("__VIEWSTATE", "__EVENTVALIDATION", "__VIEWSTATEGENERATOR"):
        pattern = (
            r'<input type="hidden" name="{0}" id="{0}" value="(.*?)" />'
        ).format(field)
        found = re.search(pattern, html, re.I)
        if found is None:
            # Same exception type the original `[0]` lookup produced, but
            # with enough context to tell which field was missing.
            raise IndexError(
                "hidden field {0} not found in {1}".format(field, url)
            )
        found_values.append(found.group(1))

    return tuple(found_values)
```
----
#### 爬蟲主程式
```python=
def main(pages=4):
    """Crawl the news list and print the counts read from each report image.

    For every list page, fresh ASPX hidden values are fetched and posted back
    with the page index as ``__EVENTTARGET``. Each linked article is then
    passed to ``get_air_activity``; when it yields an image, the image is
    downloaded, run through ``get_count_from_img`` and printed with its date.

    Args:
        pages: Number of list pages to walk, starting from index 0.
            Defaults to 4.
    """
    # Built once: the same browser-like header is sent with every POST.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13"
    }

    for i in range(pages):
        # The hidden values are re-read from the list page before each query.
        viewstate, eventvalidation, viewstategenerator = get_aspx_hidden_value(AIRFORCE_NEWS_LIST_URL)
        data = {
            # Page index is zero-padded to two digits inside the target name.
            "__EVENTTARGET": NEWS_INDEX_QUERY.format(str(i).zfill(2)),
            "__EVENTVALIDATION": eventvalidation,
            "__VIEWSTATE": viewstate,
            "__VIEWSTATEGENERATOR": viewstategenerator,
            "__EVENTARGUMENT": "",
            "ctl00$txb_s": "",
        }
        html = requests.post(
            AIRFORCE_NEWS_LIST_URL, data=data, headers=headers, timeout=30
        ).text
        soup = BeautifulSoup(html, 'html.parser')

        for link in soup.select("ul#ContentPlaceHolder1_TableList_ul_List > li > a"):
            date, img_url = get_air_activity(AIRFORCE_URL + link['href'])
            # Skip when either part is missing, so a None URL is never
            # handed to the image downloader.
            if date is None or img_url is None:
                continue
            img = get_img_from_url(img_url)
            print(date, get_count_from_img(img))
```
----
### OCR與正則表達式
```python=
def get_count_from_img(img):
    """Read aircraft, vessel and "N of the ..." counts from a report image.

    The image is run through OCR and the resulting text is searched for
    numbers that directly precede three fixed English phrases.

    Args:
        img: Image object passed straight to ``pytesseract.image_to_string``.

    Returns:
        ``(plain_cnt, ship_cnt, adiz_cnt)`` as ints:
        the number before "PLA aircraft", the number before "PLAN vessels",
        and the number before "of the". Any value that cannot be found is 0.
        When the last count is spelled out ("One of the" ... "Ten of the")
        the word is converted to its number.
    """
    # Get text from image by using OCR; newlines become spaces so a count and
    # its phrase still match when OCR splits them across lines.
    text = pytesseract.image_to_string(img).replace('\n', ' ')

    def count_before(phrase):
        # `\d+` rather than `\d*`: the optional form also matches the bare
        # phrase with an empty number, which hides later real matches.
        found = re.search(r"(\d+) " + phrase, text)
        return int(found.group(1)) if found is not None else 0

    plain_cnt = count_before("PLA aircraft")
    ship_cnt = count_before("PLAN vessels")

    adiz_cnt = count_before("of the")
    if adiz_cnt == 0:
        # Spelled-out fallback. With the old `([\d]*) of the` pattern this
        # branch could never run, because " of the" inside "One of the"
        # already matched with an empty group.
        number_words = (
            "One", "Two", "Three", "Four", "Five",
            "Six", "Seven", "Eight", "Nine", "Ten",
        )
        word = re.search(r"\b(" + "|".join(number_words) + r") of the", text)
        if word is not None:
            adiz_cnt = number_words.index(word.group(1)) + 1

    return (plain_cnt, ship_cnt, adiz_cnt)
```
----
### 生成圖表
```python=
# Nothing sophisticated here; anyone who attended the class can draw this.
import matplotlib.pyplot as plt

# Placeholder for the crawled records. Each record is read as a mapping with
# a "date" entry and a "values" entry holding exactly three counts.
data = []

dates = [str(record["date"]) for record in data]

values = []
for record in data:
    first, second, third = record["values"]
    values.append((first, second, third))

# Split the data into three series, one per position in each triple.
plains, ships, enter_azid_plains = (
    [row[column] for row in values] for column in range(3)
)

plt.figure(figsize=(20, 10))
for series, label in (
    (plains, "plane"),
    (ships, "ship"),
    (enter_azid_plains, "adiz"),
):
    plt.plot(dates, series, label=label, marker="o")

plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Values Over Time")

# add the legend
plt.legend()

# show result
plt.grid(True)
plt.tight_layout()
plt.show()
```
---
## 心路歷程
----
### 馬道中
俊騰主要是提供主題,所以Coding就由我來負責
整個撰寫的過程大概只用了兩小時就完成了
遇到的最大問題是ASPX網頁爬蟲,它會在網頁中隱藏隨機的數值,而每次Query都需要這些數值,所以要手動將數值從網頁中解析出來
還有一個問題就是空軍的網站會封鎖IP,這個報告我前前後後換了7~8個IP才完成
---
## 分工表
| 成員 | 比例 |
| -------- | -------- |
| 馬道中(14) | 50% |
| 蕭俊騰(32) | 50% |
---
```cpp
return 0;
```
<!-- # 資訊課專案
怎麼呈現資料?用個網站顯示動態?
資料報表怎麼寫?趨勢怎麼顯示
1. 平均來的次數
2. 過來的比例
3. 躁起來的頻率(區域極值)
簡報呈現
1. 數據推得事件
2. 圖表上標注事件區間
3. 針對各事件做更詳細的簡報
4. 程式架構分析
-->
{"title":"資訊課專案","description":"怎麼呈現","contributors":"[{\"id\":\"53f8c472-6765-49d7-9801-8c55dba8f231\",\"add\":6012,\"del\":368},{\"id\":\"37cf564b-ae42-4dbd-b046-6a3792f568a2\",\"add\":74,\"del\":0}]"}