The file non_readable_table.docx is a little unusual: it has no content in the default body elements. All of its content - both tables - sits inside text boxes and is therefore text box content.
python-docx is not aware of text boxes; only inline shapes are supported. However, python-docx retains the full XML of the source *.docx file, so the text box content can be read out of that XML using XML methods such as XPath.
The following example code does this and retrieves tables from the text box content if they exist.
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
def get_tables_in_text_box_elements(run_element: Run) -> [Table]:
    """Collect the tables found inside text boxes that hang off a run.

    The run's XML is searched for ``w:txbxContent`` elements, and every
    ``w:tbl`` descendant of each one is wrapped in a ``Table`` proxy.

    Args:
        run_element: The run whose underlying XML element is searched.

    Returns:
        A list of ``Table`` objects in document order; empty when the run
        holds no matching text box or the text boxes hold no tables.
    """
    # Prefix map for the second query. NOTE(review): presumably needed
    # because the txbxContent nodes are plain lxml elements without
    # python-docx's built-in prefix map -- confirm.
    w_namespace = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    # The path is positional: wildcards stand in for the wrapper elements,
    # so only text boxes nested exactly this deep below the run are matched.
    text_box_contents = run_element.element.xpath(
        './*/*/w:drawing/*/a:graphic/*/*/*/w:txbxContent'
    )

    found = []
    for text_box_content in text_box_contents:
        # Tables are created with None as parent.
        # NOTE(review): Table features that rely on the parent may not
        # work on these objects -- confirm before using them.
        found.extend(
            Table(tbl, None)
            for tbl in text_box_content.xpath('.//w:tbl', namespaces=w_namespace)
        )
    return found
# Walk the document body and report every element encountered. For each run,
# also print any tables found inside its text boxes, one tab-separated line
# per table row.
document = Document('non_readable_table.docx')
body = document._body

for body_element in body.iter_inner_content():
    # Handle the non-paragraph cases first so the paragraph logic stays flat.
    if isinstance(body_element, Table):
        print(f'Table-element: {body_element}')
        continue
    if not isinstance(body_element, Paragraph):
        print('unknown body element')
        continue

    print(f'Paragraph-element: {body_element}')
    for run_element in body_element.iter_inner_content():
        if isinstance(run_element, Hyperlink):
            print(f'Hyperlink-element: {run_element}, Address: {run_element.address}')
            continue
        if not isinstance(run_element, Run):
            print('unknown run element')
            continue

        print(f'Run-element: {run_element}, Text: {run_element.text}')
        # Text box content is only reachable through the run's XML.
        for table in get_tables_in_text_box_elements(run_element):
            print(f'Table-element: {table}')
            for table_row in table.rows:
                # One entry per paragraph of every cell in the row.
                paragraph_texts = [
                    cell_paragraph.text
                    for row_cell in table_row.cells
                    for cell_paragraph in row_cell.paragraphs
                ]
                print('\t'.join(paragraph_texts))