[python3]读取docx每个段落下的table数据
生活随笔
收集整理的這篇文章主要介紹了
[python3]读取docx每个段落下的table数据
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
需要引入第三方庫
python -m pip install python-docx一個簡單的樣例:
背景:每個段落中有若干個表格,需要摘取部分段落中的所有表格信息并輸出到excel中。
import os
import re

from docx import Document
import openpyxl as op


class DocxReader:
    """Copy the problem tables of one section of a .docx file into an Excel sheet.

    The section of interest starts after the paragraph whose text is
    "解決的問題" and ends at the paragraph whose text is "更新說明".  Every
    table found between two consecutive paragraphs of that section is parsed
    as a two-column key/value table and flattened into one spreadsheet row.

    Args:
        srcfile: Path of the .docx file to read.
        dstfile: Path of the .xlsx file to write.
    """

    # A paragraph consisting solely of such a token names the patch version
    # that applies to the tables following it.
    _PATCH_VERSION = re.compile(r"(V\d{3}R\d{3}\w+)")

    def __init__(self, srcfile, dstfile):
        self.srcfile = srcfile
        self.dstfile = dstfile
        self.docx = None
        self.get_document()

    def get_document(self):
        """Load ``self.srcfile`` into ``self.docx``; leave it ``None`` if the file is missing."""
        if os.path.exists(self.srcfile):
            self.docx = Document(self.srcfile)

    def get_all_interested_info(self):
        """Collect every problem table of the section and write the rows to ``self.dstfile``.

        Does nothing when no document was loaded.
        """
        if self.docx is None:
            return

        data = [["設備形態", "補丁號", "問題序號", "問題單號", "問題現象", "問題影響", "嚴重級別"]]
        interest = self._select_paragraphs()

        # Map each top-level <w:tbl> element to its Table wrapper so a table
        # met while walking the XML siblings can be resolved directly.
        table_for = {table._tbl: table for table in self.docx.tables}

        patch_version = None
        # The last collected paragraph only marks the end of the final gap,
        # so it is never used as a starting point.
        for paragraph, following in zip(interest, interest[1:]):
            title = paragraph.text.strip()
            if self._PATCH_VERSION.fullmatch(title):
                patch_version = title
            for element in self._siblings_between(paragraph._p, following._p):
                table = table_for.get(element)
                if table is not None:
                    self.handle_one_table_info(table, data, patch_version)

        self.create_excel_with_data(self.dstfile, data)

    def _select_paragraphs(self):
        """Return the paragraphs after "解決的問題" up to and including "更新說明"."""
        selected = []
        started = False
        for paragraph in self.docx.paragraphs:
            title = paragraph.text.strip()
            if title == "更新說明":
                selected.append(paragraph)
                break
            if title == "解決的問題":
                started = True
                continue
            if started:
                selected.append(paragraph)
        return selected

    @staticmethod
    def _siblings_between(start, stop):
        """Yield the XML siblings that follow ``start`` and precede ``stop``.

        Iteration also ends when the sibling chain runs out, so a missing
        ``stop`` element cannot cause an attribute access on ``None``.
        """
        element = start.getnext()
        while element is not None and element is not stop:
            yield element
            element = element.getnext()

    @staticmethod
    def handle_one_table_info(table, data, patch):
        """Append one spreadsheet row built from a key/value problem table.

        The table is ignored unless the first cell of its first row reads
        "問題單號".  Each row contributes one entry: first cell is the key,
        second cell is the value.  Rows with fewer than two cells are skipped.

        Args:
            table: A python-docx table (anything exposing ``rows`` whose items
                expose ``cells`` with a ``text`` attribute).
            data: List of rows; the new row is appended in place.
            patch: Patch version recorded in the row (may be ``None``).
        """
        rows = list(table.rows)
        if not rows:
            return
        first_cells = rows[0].cells
        if not first_cells or first_cells[0].text.strip() != "問題單號":
            return

        problem = {}
        for row in rows:
            cells = row.cells
            if len(cells) < 2:
                continue
            problem[cells[0].text.strip()] = cells[1].text.strip()

        device = problem.get("涉及的產品型號") or problem.get("涉及機型")
        if not device:
            # Print the ticket number so the incomplete table can be located.
            print(problem.get("問題單號"))
            device_type = "####"
        elif "系列" in device:
            device = device.replace("/", "").replace("、", "")
            devices = sorted(re.findall(r"(\S*?)系列", device))
            devices = [x.strip() for x in devices]
            device_type = ";".join(devices).strip(";")
        elif "/" in device:
            devices = sorted(device.split("/"))
            devices = [x.strip() for x in devices]
            device_type = ";".join(devices).strip(";")
        else:
            device_type = device.replace("、", ";")

        # data[0] is the header row, so len(data) is the 1-based problem index.
        index = len(data)
        data.append([
            device_type,
            patch,
            str(index),
            problem.get("問題單號"),
            problem.get("問題現象"),
            problem.get("問題影響"),
            problem.get("嚴重級別"),
        ])

    @staticmethod
    def create_excel_with_data(excel, data):
        """Write ``data`` (a list of rows) to a new workbook saved at ``excel``.

        A failure while saving is reported on stdout instead of being raised.
        """
        wb = op.Workbook()
        ws = wb.active
        for row in data:
            ws.append(row)
        try:
            wb.save(excel)
        except Exception as e:
            # Best-effort report: the script's last step should not end in a traceback.
            print(f"[Error]生成文件 {excel} 失敗,原因:{e}")
        else:
            print(f"文件輸出完成: {excel}")


if __name__ == "__main__":
    src = r"E:\test.docx"
    dst = r"E:\test.xlsx"
    doc = DocxReader(src, dst)
    doc.get_all_interested_info()

# 總結 (Summary)
以上是生活随笔為你收集整理的[python3]读取docx每个段落下的table数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: VB 获取文件名后缀
- 下一篇: 软文写作是什么?如何写软文?软文标题怎样