参考网上的一段代码,可以按照文档中的先后顺序依次遍历每个块,并判断它是文本段落(text)还是表格(table):
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, _Row, Table
from docx.text.paragraph import Paragraph
def iter_block_items(parent):
    """Yield the paragraph and table children of *parent* as proxy objects.

    *parent* must be a ``_Document``, ``_Cell`` or ``_Row`` instance. Its
    underlying XML element is walked with ``iterchildren()``; every ``CT_P``
    child is wrapped in a ``Paragraph`` and every ``CT_Tbl`` child in a
    ``Table``, in the order ``iterchildren()`` returns them. Children of any
    other element type are skipped.

    Raises:
        ValueError: if *parent* is none of the supported types. Because this
            is a generator, the error surfaces on first iteration rather
            than when the function is called.
    """
    # Maps each supported wrapper type to the way its XML element is reached.
    element_getters = (
        (_Document, lambda wrapper: wrapper.element.body),
        (_Cell, lambda wrapper: wrapper._tc),
        # NOTE(review): the direct children of a row element are presumably
        # cells rather than paragraphs/tables, in which case this branch
        # yields nothing -- confirm against python-docx.
        (_Row, lambda wrapper: wrapper._tr),
    )
    for wrapper_type, get_element in element_getters:
        if isinstance(parent, wrapper_type):
            container = get_element(parent)
            break
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            yield Table(node, parent)
通过如下代码使用:
# Example usage: walk the blocks yielded by iter_block_items() and branch on
# their type. `document` is expected to be defined by the surrounding code.
for block in iter_block_items(document):  # traverse in document order
    if isinstance(block, Paragraph):  # text paragraph
        # Process the paragraph text here. Known limitation: the labels that
        # Word generates through automatic numbering (e.g. the "1" in "1. aa"
        # or the "(2)" in "(2) .aa") are not recognized when reading the text.
        # NOTE(review): the numbering reference presumably lives in the
        # paragraph properties (w:numPr) rather than in the text -- confirm.
        pass
    elif isinstance(block, Table):  # table
        # Read the table here; no problems observed so far.
        pass
通过调试 block 的属性,暂时没有发现自动编号的相关信息,不知道该如何获取这些编号呢?