I am working on a Python 3 script that converts a Word .docx file to an HTML file. When converting numbered lists, I am having difficulty preserving the original numbers of the list items. I have included an example of what the script is doing below.
Example — Word document:
1. Test item 1
2. Test item 2
break in list for some random text
3. Test item 3
4. Test item 4
HTML output (the numbering restarts after the break):
1. Test item 1
2. Test item 2
break in list for some random text
1. Test item 3
2. Test item 4
I have heard this is possible with docx2python; however, I can't figure out how, and the documentation doesn't give me any indication of how to handle my specific use case.
The code is listed below. Any help would be greatly appreciated.
import mammoth
import os
from bs4 import BeautifulSoup
import re
def extract_style_props(style):
    """Parse an inline CSS ``style`` attribute value into a dict.

    Args:
        style: The raw attribute text, e.g. ``"text-align: center; font-size: 12pt"``.
            May be empty or ``None``.

    Returns:
        A dict mapping each stripped property name to its stripped value.
        Fragments without a colon are ignored. An empty or missing ``style``
        yields an empty dict.
    """
    props = {}
    if not style:
        return props
    for declaration in style.split(';'):
        # Split on the FIRST colon only: values such as ``url(http://...)``
        # legitimately contain colons, and a plain ``split(':')`` unpacked
        # into two names would raise ValueError on them.
        key, sep, value = declaration.partition(':')
        if sep:
            props[key.strip()] = value.strip()
    return props
def _rewrite_inline_styles(soup):
    """Turn ``text-align`` into an ``align-*`` class and ``pt`` font sizes into ``px``."""
    for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']):
        props = extract_style_props(elem.get('style', ''))
        if not props:
            continue

        # Handle text alignment: expose it as a CSS class.
        text_align = props.get('text-align')
        if text_align:
            elem['class'] = elem.get('class', []) + [f'align-{text_align}']

        # Handle font size.
        font_size = props.get('font-size')
        if font_size:
            if 'pt' in font_size:
                try:
                    size_pt = float(font_size.replace('pt', ''))
                except ValueError:
                    # Not a plain "<number>pt" value; keep it as written
                    # rather than aborting the whole conversion.
                    pass
                else:
                    # Convert pt to px (approximate conversion).
                    props['font-size'] = f'{int(size_pt * 1.33)}px'
            # Re-serialise every declaration so that properties other than
            # font-size are not discarded from the style attribute.
            elem['style'] = ' '.join(
                f'{name}: {value};' for name, value in props.items()
            )


def convert_word_to_html(docx_path, output_dir):
    """Convert a Word ``.docx`` file to a standalone HTML file.

    The document is converted with mammoth using a custom style map, the
    resulting HTML is post-processed with BeautifulSoup (alignment classes,
    pt-to-px font sizes), wrapped in a page with a small stylesheet, and
    written to ``output_dir`` under the input's base name with a ``.html``
    extension. Any messages reported by mammoth are printed.

    Args:
        docx_path: Path to the input ``.docx`` file.
        output_dir: Directory in which the HTML file is written.

    Returns:
        The path of the HTML file that was written.
    """
    # Custom style map. Comments must sit on their own lines: mammoth only
    # skips lines that START with '#', so a trailing comment on a mapping
    # line makes that mapping unparseable.
    style_map = """
    p[style-name='Heading 1'] => h1:fresh
    p[style-name='Heading 2'] => h2:fresh
    p[style-name='Heading 3'] => h3:fresh
    p[style-name='Heading 4'] => h4:fresh
    p[style-name='Heading 5'] => h5:fresh
    p[style-name='Heading 6'] => h6:fresh
    # Map List Paragraph to ol > li for numbered lists
    p[style-name='List Paragraph'] => ol > li:fresh
    # Keep List Bullet for bullet points
    p[style-name='List Bullet'] => ul > li:fresh
    # Keep List Number for ordered (numbered) lists
    p[style-name='List Number'] => ol > li:fresh
    """

    # Convert to HTML
    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file, style_map=style_map)
    html = result.value
    messages = result.messages

    # Parse HTML with BeautifulSoup and normalise inline styles
    soup = BeautifulSoup(html, 'html.parser')
    _rewrite_inline_styles(soup)

    # Add CSS for styling
    css = """
    <style>
    body { font-family: Arial, sans-serif; line-height: 1.6; color: #333; }
    .align-left { text-align: left; }
    .align-center { text-align: center; }
    .align-right { text-align: right; }
    .align-justify { text-align: justify; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #ddd; padding: 8px; }
    th { background-color: #f2f2f2; }
    img { max-width: 100%; height: auto; }
    </style>
    """

    # Combine CSS and modified HTML.
    # - str(soup) is used instead of prettify(): prettify() inserts newlines
    #   around inline elements, which changes how the text renders.
    # - The charset is declared because the file is written as UTF-8.
    full_html = (
        '<!DOCTYPE html>\n'
        f'<html><head><meta charset="utf-8">{css}</head>'
        f'<body>{soup}</body></html>'
    )

    # Save the HTML file
    filename = os.path.splitext(os.path.basename(docx_path))[0] + '.html'
    output_path = os.path.join(output_dir, filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(full_html)

    print(f"Converted {docx_path} to {output_path}")
    if messages:
        print("Conversion messages:")
        for message in messages:
            print(message)

    return output_path
# Example usage. Guarded so the conversion only runs when this file is
# executed as a script, not as a side effect of importing it.
if __name__ == "__main__":
    docx_path = './input.docx'
    output_dir = './'
    convert_word_to_html(docx_path, output_dir)