# 分析近期共機數量
#### 組員: 14 26 32

---

## Github專案
[tnfsh-cs-grade-12-project](https://github.com/tobiichi3227/tnfsh-cs-grade-12-project)

---

## 摘要
透過Python爬取空軍空情動態並繪製圖表分析數量

---

## 研究動機
我家住機場旁,飛機變多了,有點吵
想了解近期共機數量以及是什麼事件導致的

---

## 解放軍軍機與軍艦統計圖表
![Figure_1](https://hackmd.io/_uploads/SJwCERqlC.png)

---

## 解放軍氣球與火箭發射統計圖表
![Figure_2](https://hackmd.io/_uploads/Bk8w1CqgC.png)

---

## 各項時間對應事件分析
- 02/01 中方宣佈M503航線取消
- 02/14~03/01 海巡金門撞人
- 03/17~03/23 金門釣客落水
- 03/28 金馬水域進行實彈射擊
- 04/03 地震

---

## M503航線
![M503航線](https://images.chinatimes.com/newsphoto/2024-02-05/656/B50A00_P_01_02.jpg)

---

## 程式架構
![程式架構](https://hackmd.io/_uploads/r1ILp2qgA.png =15%x)

----

### 爬蟲
#### 解析網頁
```python=
# The air force pages are laid out inconsistently, so the report image is
# looked up with several selectors, tried in this order.
IMG_SELECTORS = (
    'span#ContentPlaceHolder1_lab_Descr > p > img',
    'span#ContentPlaceHolder1_lab_Descr > div > div > p > img',
    'span#ContentPlaceHolder1_lab_Descr > div > p > img',
    'span#ContentPlaceHolder1_lab_Descr > div > div > img',
)


def get_air_activity(url) -> "tuple[str | None, str | None]":
    """Return (date, image URL) for one report page.

    Returns (None, None) when none of the selectors finds an image; in that
    case any counts found in the page text are printed instead.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    img = None
    for selector in IMG_SELECTORS:
        img = soup.select_one(selector)
        if img is not None:
            break

    if img is None:
        # No image on this page: print the sortie / vessel counts from the text.
        for pattern in (r"([\d]*)架次", r"([\d]*)艘次"):
            found = re.search(pattern, html)
            if found is not None:
                print(found.group(1))
        return None, None

    # The date is the run of digits at the very start of the alt text.
    date = re.search(r"([\d]*)", img['alt']).group()
    img_url = img['src']
    # NOTE(review): the first 8 characters of src are dropped before joining
    # with AIRFORCE_URL - presumably a relative-path prefix; confirm.
    return date, AIRFORCE_URL + img_url[8:]
```

----

#### ASPX爬蟲核心
```python=
# ASPX pages set random values such as __VIEWSTATE on every load and each
# query must send them back, so they are extracted from the page for the
# next query.
def get_aspx_hidden_value(url):
    """Return (viewstate, eventvalidation, viewstategenerator) read from url.

    Raises IndexError when one of the hidden fields is not in the page.
    """
    html = requests.get(url).text
    values = []
    for name in ("__VIEWSTATE", "__EVENTVALIDATION", "__VIEWSTATEGENERATOR"):
        pattern = rf'<input type="hidden" name="{name}" id="{name}" value="(.*?)" />'
        found = re.search(pattern, html, re.I)
        if found is None:
            raise IndexError(f"hidden field {name} not found in {url}")
        values.append(found.group(1))
    return tuple(values)
```

----

#### 爬蟲主程式
```python=
def main():
    """Walk the first four list pages and print the counts of every report."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13"
    }
    for i in range(4):
        # The hidden values are re-read before every query.
        viewstate, eventvalidation, viewstategenerator = get_aspx_hidden_value(AIRFORCE_NEWS_LIST_URL)
        data = {
            "__EVENTTARGET": NEWS_INDEX_QUERY.format(str(i).zfill(2)),
            "__EVENTVALIDATION": eventvalidation,
            "__VIEWSTATE": viewstate,
            "__VIEWSTATEGENERATOR": viewstategenerator,
            "__EVENTARGUMENT": "",
            "ctl00$txb_s": "",
        }
        html = requests.post(AIRFORCE_NEWS_LIST_URL, data=data, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.select("ul#ContentPlaceHolder1_TableList_ul_List > li > a"):
            date, img_url = get_air_activity(AIRFORCE_URL + link['href'])
            if date is None and img_url is None:
                continue
            img = get_img_from_url(img_url)
            print(date, get_count_from_img(img))
```

----

### OCR與正則表達式
```python=
# Spelled-out counts that appear in front of "of the" instead of digits.
NUMBER_WORDS = {"One": 1, "Two": 2}


def _first_count(pattern, text):
    """Return the integer captured by the first match of pattern, or 0."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found is not None else 0


def get_count_from_img(img):
    """Run OCR on a report image and return (plane_cnt, ship_cnt, adiz_cnt)."""
    # Get text from image by using OCR
    text = pytesseract.image_to_string(img)
    text = text.replace('\n', ' ')

    # \d+ rather than \d*: with \d* a phrase that has no number in front of it
    # still matches with an empty capture, which hides later numbered phrases
    # and makes the spelled-out fallback below unreachable.
    plane_cnt = _first_count(r"(\d+) PLA aircraft", text)
    ship_cnt = _first_count(r"(\d+) PLAN vessels", text)

    adiz = re.search(r"(\d+) of the", text)
    if adiz is not None:
        adiz_cnt = int(adiz.group(1))
    else:
        adiz_cnt = next(
            (count for word, count in NUMBER_WORDS.items() if f"{word} of the" in text),
            0,
        )

    return (plane_cnt, ship_cnt, adiz_cnt)
```

----

### 生成圖表
```python=
# Nothing fancy here - anyone who attended the class can draw this.
import matplotlib.pyplot as plt

# Each record is read as o["date"] and o["values"] (three numbers).
data = []

dates = [str(o["date"]) for o in data]

# Split the data into three series.
values = []
for o in data:
    v1, v2, v3 = o["values"]
    values.append((v1, v2, v3))
planes = [v[0] for v in values]
ships = [v[1] for v in values]
adiz_planes = [v[2] for v in values]

plt.figure(figsize=(20, 10))
plt.plot(dates, planes, label="plane", marker="o")
plt.plot(dates, ships, label="ship", marker="o")
plt.plot(dates, adiz_planes, label="adiz", marker="o")
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Values Over Time")

# add legend
plt.legend()

# show result
plt.grid(True)
plt.tight_layout()
plt.show()
```

---

## 心路歷程

----

### 馬道中
俊騰主要是提供主題,所以Coding就由我來負責
整個撰寫的過程大概只用了兩小時就完成了
遇到的最大問題是ASPX網頁爬蟲,它會在網頁中隱藏隨機的數值,而每次Query都需要這些數值,所以要手動將數值從網頁中解析出來
還有一個問題就是空軍的網站會封鎖IP,這個報告我前前後後換了7~8個IP才完成

---

## 分工表
| 成員 | 比例 |
| -------- | -------- |
| 馬道中(14) | 50% |
| 蕭俊騰(32) | 50% |

---

```cpp
return 0;
```

<!--
# 資訊課專案
怎麼呈現資料?用個網站顯示動態?
資料報表怎麼寫?趨勢怎麼顯示
1. 平均來的次數
2. 過來的比例
3. 躁起來的頻率(區域極值)

簡報呈現
1. 數據推得事件
2. 圖表上標注事件區間
3. 針對各事件做更詳細的簡報
4. 程式架構分析
-->
{"title":"資訊課專案","description":"怎麼呈現","contributors":"[{\"id\":\"53f8c472-6765-49d7-9801-8c55dba8f231\",\"add\":6012,\"del\":368},{\"id\":\"37cf564b-ae42-4dbd-b046-6a3792f568a2\",\"add\":74,\"del\":0}]"}
    263 views