從PDF擷取表格並輸出成CSV、Excel檔案

# 擷取PDF單一頁面的表格 ```python= import pdfplumber import pandas as pd # 打開PDF文件 with pdfplumber.open("study_list.pdf") as pdf: # 獲取PDF的第一頁 first_page = pdf.pages[0] # 提取表格 table = first_page.extract_table() # 將表格轉換為DataFrame df = pd.DataFrame(table[1:], columns=table[0]) # 儲存成CSV文件 df.to_csv("output.csv", index=False) print("Table extracted and saved to output.csv") ``` * study_list.pdf ![image](https://hackmd.io/_uploads/rk3SVJStA.png) * output.csv ![image](https://hackmd.io/_uploads/rJrwEyHtA.png) # 從多頁的PDF擷取指定表格 ```python= import pdfplumber import pandas as pd # 獲得輸入的頁碼和表格索引 page_number = int(input("请輸入要提取表格的頁碼（從1開始）：")) table_index = int(input("请輸入要提取表格的索引（從1開始）：")) # 打開PDF文件 with pdfplumber.open("Test_Fake_File.pdf") as pdf: # 檢查頁碼是否有效 if page_number > len(pdf.pages) or page_number < 1: raise ValueError("無效的頁碼") # 獲得頁碼對應的頁面 page = pdf.pages[page_number - 1] # 提取頁面中所有表格 tables = page.extract_tables() # 檢查表格索引是否有效 if table_index > len(tables) or table_index < 1: raise ValueError("無效的表格索引") # 抓取指定的表格 specific_table = tables[table_index -1] # 將表格轉換為DataFrame df = pd.DataFrame(specific_table[1:], columns=specific_table[0]) # 儲存成Excel文件 df.to_excel("output.xlsx", index=False) print("Table extracted and saved to output.xlsx") ``` * Test_Fake_File.pdf ![image](https://hackmd.io/_uploads/B1YxIJSY0.png) ![image](https://hackmd.io/_uploads/r19zL1SF0.png) ![image](https://hackmd.io/_uploads/rylEIkrFC.png) * output.xlsx (擷取條件：第2頁第2個表格) ![image](https://hackmd.io/_uploads/HkAsL1BYA.png) ![image](https://hackmd.io/_uploads/H1i_U1rFR.png) # 處理具有合併儲存格的表格 ```python= import pdfplumber import pandas as pd from openpyxl import Workbook from openpyxl.utils import get_column_letter from openpyxl.styles import Font, Alignment def extract_table_from_pdf(pdf_path, page_number, table_index): # 打開 PDF 文件 with pdfplumber.open(pdf_path) as pdf: # 檢查頁碼是否有效 if page_number > len(pdf.pages) or page_number < 1: raise ValueError("無效的頁碼") # 獲取指定頁 page = pdf.pages[page_number - 1] # 頁碼從1開始，索引從0開始 # 
提取頁面中的所有表格 tables = page.extract_tables() # 檢查表格索引是否有效 if table_index > len(tables) or table_index < 1: raise ValueError("無效的表格索引") # 獲取指定表格 specific_table = tables[table_index - 1] # 表格索引從1開始，索引從0開始 return specific_table def create_excel_with_merged_cells(table_data, output_path, merge_index): wb = Workbook() ws = wb.active # 填充表格數據 for i, row in enumerate(table_data): for j, cell in enumerate(row): if cell is not None: ws.cell(row=i+1, column=j+1, value=cell) # 自動調整欄寬 for col in ws.columns: max_length = 0 column = col[0].column # 取得列字母 for cell in col: try: if len(str(cell.value)) > max_length: max_length = len(cell.value) except: pass adjusted_width = (max_length + 2) # 可以調整加的寬度 ws.column_dimensions[get_column_letter(column)].width = adjusted_width # 將合併的儲存格加粗 & 置中 ws.merge_cells(merge_index) mergecell = ws[merge_index.split(':')[0]] mergecell.font = Font(bold=True) # 設定字型為粗體 mergecell.alignment = Alignment(horizontal='center', vertical='center') # 設定置中對齊 # 保存 Excel 文件 wb.save(output_path) def main(): # 獲取使用者輸入的頁碼和表格索引 pdf_path = "Text_Fake_File_combine.pdf" page_number = int(input("請輸入要提取表格的頁碼（從1開始）：")) table_index = int(input("請輸入要提取表格的索引（從1開始）：")) merge_index = str(input("請輸入合併的儲存格座標（例如A1:A4、或B1:C2等格式）：")) # 提取表格 table_data = extract_table_from_pdf(pdf_path, page_number, table_index) # 保存為 Excel 文件 output_path = "output_combine.xlsx" create_excel_with_merged_cells(table_data, output_path, merge_index) print(f"Table extracted and saved to {output_path}") if __name__ == "__main__": main() ``` * Text_Fake_File_combine.pdf ![image](https://hackmd.io/_uploads/rkrDUWIK0.png) ![image](https://hackmd.io/_uploads/BktO8bLY0.png) ![image](https://hackmd.io/_uploads/SJAcFZItR.png) ## 情境1：表頭在最上方 ==(擷取條件：第1頁第1個表格，表頭在第一行)== ![image](https://hackmd.io/_uploads/HynT1G8tA.png) ![image](https://hackmd.io/_uploads/BJE1jZIFR.png) ## 情境2：表頭在中間 ==(擷取條件：第2頁第1個表格，表頭在第二行)== ![image](https://hackmd.io/_uploads/SyK7eGLKR.png) ![image](https://hackmd.io/_uploads/ryPtj-IKR.png) ## 
情境3：表頭在奇怪的位置 ==(擷取條件：第3頁第1個表格，表頭在中間跨行，約是B1:C2)== ![image](https://hackmd.io/_uploads/HJfOefIY0.png) ![image](https://hackmd.io/_uploads/rytFlG8FA.png) # 使用OCR辨識單一圖片內的表格內容透過`pytesseract`套件對圖片進行光學辨識，首先要去[tesseract的GitHub](https://github.com/tesseract-ocr/tesseract?tab=readme-ov-file)下載並安裝Tesseract OCR引擎： ![image](https://hackmd.io/_uploads/HJoswN22A.png) ![image](https://hackmd.io/_uploads/B1-UONh30.png) ![image](https://hackmd.io/_uploads/BkA_d4h2C.png) ```python= import pytesseract from PIL import Image import pandas as pd ocrpath = "Your tesseract path." # 如果你使用的是Windows，設置Tesseract的路徑 pytesseract.pytesseract.tesseract_cmd = ocrpath # 打開圖片 img_path = 'table_image.png' # 圖片文件的路徑 image = Image.open(img_path) # 使用OCR識別圖片中的表格文本 ocr_result = pytesseract.image_to_string(image, lang='chi_tra') # 將OCR的結果進行處理，將文本轉換為列表 lines = ocr_result.split("\n") data = [] for line in lines: # 根據圖片中的排版，通過分隔符（如空格、Tab）來拆分表格行 row = line.split() if row: # 如果該行有內容 data.append(row) # 使用pandas將數據轉換為表格格式 df = pd.DataFrame(data) # 將表格數據保存為Excel文件 df.to_excel("output_ocr.xlsx", index=False, header=False) print("表格已成功轉換並保存至 'output_ocr.xlsx'") ``` * table_image.png ![table_image](https://hackmd.io/_uploads/B1dGFEn20.png) ==但是中文辨識的成效並不太好......== ![image](https://hackmd.io/_uploads/r1Z4tN22C.png) # 處理被旋轉過的PDF表格這邊以聯詠2023年的年報為範例(LY_2023.pdf)： ![pdf_ss_13](https://hackmd.io/_uploads/BkU-sN230.png) 首先，把被旋轉過的頁面轉回來。 ```python= import fitz # PyMuPDF # 打開 PDF 文件 pdf_file = "LY_2023.pdf" # 替換成你的 PDF 檔案 output_pdf = "rotated_output_LY.pdf" # 保存結果的文件名稱 doc = fitz.open(pdf_file) # 要旋轉的頁面列表 (頁碼從 1 開始) pages_to_rotate = [19, 20, 21, 22] rotation_angle = 90 # 順時針旋轉 90 度 # 針對每個指定頁面進行旋轉 for page_num in pages_to_rotate: page = doc.load_page(page_num - 1) # PDF 頁碼從 0 開始 page.set_rotation(rotation_angle) # 設定旋轉角度 # 保存旋轉後的 PDF 文件 doc.save(output_pdf) doc.close() print(f"Rotated pages saved to '{output_pdf}'") ``` * rotated_output_LY.pdf ![image](https://hackmd.io/_uploads/HyuYiVh2C.png) :::warning 
rotation_angle在少數情況下需要設定成45度，才會轉動頁面，原理目前還不清楚。 ::: 然後，重新用pdfplumber讀取表格資料： ```python= import pdfplumber import pandas as pd # 打開PDF文件 with pdfplumber.open("rotated_output_LY.pdf") as pdf: # 獲取PDF的第一頁 first_page = pdf.pages[12] # 提取表格 table = first_page.extract_table() # 將表格轉換為DataFrame df = pd.DataFrame(table[1:]) print(df) # 儲存成CSV文件 df.to_csv("output_GY.csv", index=False) print("Table extracted and saved to output.csv") ``` ![image](https://hackmd.io/_uploads/HyopoEnnR.png) # 處理斜線標題表格 ![pdf_ss_15](https://hackmd.io/_uploads/rJDAkB23C.png) 其實pdfplumber也可以正常讀取資料，但這邊想呈現的效果，是用箭頭指出橫向標題是"條件"，縱向標題是"姓名"： ```python= import pdfplumber import pandas as pd # 打開PDF文件 with pdfplumber.open("LY_2023.pdf") as pdf: # 獲取PDF的第一頁 first_page = pdf.pages[15] # 提取表格 table = first_page.extract_table() # 將表格轉換為DataFrame df = pd.DataFrame(table[0:]) # 取得 [0, 0] 的欄位內容 cell_content = df.iloc[0, 0] # 使用 '\n' 切割內容 title = cell_content.split('\n') # 覆寫 [0, 0] 的內容為新的格式 "{title[0]}→\\n{title[1]↓}" if len(title) > 1: df.iloc[0, 0] = f"{title[0]}→\n{title[1]}↓" else: print("The content does not contain a '\\n' to split.") print(df) # 儲存成CSV文件 df.to_csv("output_slope.csv", index=False) print("Table extracted and saved to output.csv") ``` ![image](https://hackmd.io/_uploads/rk0heHn3C.png) # 小工具：PDF頁面擷取可以將指定頁數的PDF頁面截圖存檔。 ```python= import fitz # PyMuPDF # 打開 PDF 文件 pdf_file = 'LY_2023.pdf' # 替換成你的 PDF 檔案路徑 doc = fitz.open(pdf_file) # 需要截圖的頁面 pages_to_screenshot = [13] # 針對每個頁面進行截圖並保存 for page_num in pages_to_screenshot: page = doc.load_page(page_num - 1) # PyMuPDF 的頁面是從 0 開始計算 pix = page.get_pixmap() # 將頁面轉換為圖像 image_file = f'pdf_ss_{page_num}.png' pix.save(image_file) # 保存為 PNG 檔案 print(f'Saved: {image_file}') # 關閉 PDF 文件 doc.close() ``` :::danger **★ 關於PDF複製內容後出現亂碼** 在複製聚陽(1477)的年報內容時出現以下亂碼，檢查後發現文件有內嵌以下字體： ![image](https://hackmd.io/_uploads/B1zqu__3R.png) ![image](https://hackmd.io/_uploads/ryXpuuu2C.png) * DFKaiShu-SB-Estd-Bf-ETen-B5-H * Helvetica-Bold * PMingLiU-ETen-B5-H * PalatinoLinotype-Bold * 
PalatinoLinotype-Roman
* TimesNewRomanPS-BoldMT
* TimesNewRomanPSMT

PDF檔案會正常顯示，不過當複製貼上、以及用程式讀取PDF內容時，就會變成亂碼。下載字體後仍無法正常讀取；將PDF轉成圖片進行OCR的效果不好；Google文件、文件轉換程式、修改檔案編碼等方法也只能得到充滿亂碼的新文件。目前仍不知道該如何處理。

- [字型為「DFKAI-sb 標楷體」，pdf中的字型會變成破字](https://forum.aspose.com/t/aspose-pdf-html-pdf-dfkai-sb-pdf/169286)
- [How to solve no unicode mapping error from PDFBox?](https://stackoverflow.com/questions/58829597/how-to-solve-no-unicode-mapping-error-from-pdfbox)