Automating Weekly Planning Documents: Extracting and Merging Information Windows with Theme Knowledge
This project automates the extraction and merging of two document types—"Information Windows" and "Theme Knowledge"—into a unified horizontal A4 format for classroom wall displays. The workflow consolidates multiple processing steps into a streamlined pipeline.
Processing Pipeline
- Standardize document filenames with consistent formatting
- Convert legacy .doc files to .docx format
- Remove empty paragraphs and line breaks from documents
- Extract content from Word documents into Excel
- Generate new documents from templates using Excel data
- Convert documents to PNG images
- Split images for classroom portal use
Step 1: Filename Standardization
Original files had inconsistent naming conventions. The script normalizes filenames to follow the pattern "Week XX Information Window + Theme Knowledge.doc" with zero-padded week numbers.
import os
import time
# Folder whose files are renamed by normalize_filenames() below.
source_dir = r"D:\project\weekly_planning\source_docs"
def normalize_filenames(directory, pause_seconds=2):
    """Rename the planning documents in *directory* to one consistent pattern.

    Two passes are made over the folder:

    1. Every filename that contains exactly one '窗' keeps the text before
       it and receives the fixed ending '窗+主题知识.doc'.
    2. In every filename, the first '第' that is followed by exactly one
       digit gets that digit zero-padded ('第1...' becomes '第01...').

    Args:
        directory: Folder whose entries are renamed in place.
        pause_seconds: Delay between the two passes. Defaults to the two
            seconds the script has always waited; pass 0 to skip it.

    Raises:
        OSError: If the folder cannot be listed or a rename fails.
    """
    import re  # local import: only this function needs it

    suffix = '窗+主题知识.doc'
    # A lone ASCII digit right after '第'. Matching the digit itself, instead
    # of relying on a fixed filename length, leaves unrelated files and
    # two-digit weeks untouched.
    single_digit_week = re.compile(r'第([0-9])(?![0-9])')

    # First pass: standardize format
    for filename in os.listdir(directory):
        head, found, tail = filename.partition('窗')
        if not found or '窗' in tail:
            continue  # zero or several '窗': not a name this pass handles
        new_filename = head + suffix
        if new_filename != filename:
            os.rename(
                os.path.join(directory, filename),
                os.path.join(directory, new_filename),
            )

    if pause_seconds:
        time.sleep(pause_seconds)

    # Second pass: zero-pad single digit weeks
    for filename in os.listdir(directory):
        padded_name = single_digit_week.sub(r'第0\g<1>', filename, count=1)
        if padded_name != filename:
            os.rename(
                os.path.join(directory, filename),
                os.path.join(directory, padded_name),
            )
# Script entry point for Step 1: rename everything in source_dir.
normalize_filenames(source_dir)
Step 2: DOC to DOCX Conversion
Legacy Word documents are converted to the modern Office Open XML format using the Windows COM interface.
import os
from win32com import client as wc
def convert_doc_to_docx(input_folder, output_folder):
    """Convert every legacy .doc file in *input_folder* to .docx using Word.

    Each document is opened through the Word COM application and saved into
    *output_folder* under the same base name with a '.docx' extension.

    Args:
        input_folder: Folder that is scanned (non-recursively) for .doc files.
        output_folder: Folder receiving the converted files; created when
            it does not exist yet.

    Returns:
        None.
    """
    doc_files = [
        f for f in os.listdir(input_folder)
        # Names starting with '~$' are skipped (Word's temporary lock files).
        if f.lower().endswith('.doc') and not f.startswith('~$')
    ]
    os.makedirs(output_folder, exist_ok=True)
    if not doc_files:
        return  # nothing to convert, so do not start Word at all

    wd_format_xml_document = 12  # WdSaveFormat value for .docx
    word_app = wc.Dispatch("Word.Application")
    try:
        for doc_file in doc_files:
            # Word resolves relative paths against its own working
            # directory, so always hand it absolute paths.
            input_path = os.path.abspath(os.path.join(input_folder, doc_file))
            output_name = os.path.splitext(doc_file)[0] + '.docx'
            output_path = os.path.abspath(
                os.path.join(output_folder, output_name)
            )
            document = word_app.Documents.Open(input_path)
            try:
                document.SaveAs(output_path, wd_format_xml_document)
            finally:
                document.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # hidden WINWORD process behind.
        word_app.Quit()
# Project root; the step folders 01doc and 02docx are addressed below it.
base_path = r"D:\project\weekly_planning"
# Script entry point for Step 2: convert 01doc into 02docx.
convert_doc_to_docx(base_path + r'\01doc', base_path + r'\02docx')
Step 3: Removing Empty Paragraphs
Documents often contain empty paragraphs that need to be cleaned before content extraction.
from docx import Document
import glob
import os
def _is_removable_paragraph(para):
    """Return True when *para* has no text and holds nothing worth keeping."""
    if para.text.strip():
        return False
    # A paragraph without text can still carry a picture, an embedded object
    # or the section properties of a section break; deleting it would lose
    # that content or layout.
    kept_content = para._element.xpath(
        './/w:drawing | .//w:pict | .//w:object | ./w:pPr/w:sectPr'
    )
    return not kept_content


def remove_empty_paragraphs(input_dir, output_dir):
    """Copy each .docx from *input_dir* to *output_dir* without blank paragraphs.

    Only body-level paragraphs (``Document.paragraphs``) are examined. A
    paragraph is dropped when its text is empty or whitespace-only and it
    contains no drawing, picture, embedded object or section properties.

    Args:
        input_dir: Folder scanned (non-recursively) for '*.docx' files.
        output_dir: Folder receiving the cleaned copies under the same file
            names; created when it does not exist yet.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    for file_path in sorted(glob.glob(os.path.join(input_dir, '*.docx'))):
        file_name = os.path.basename(file_path)
        if file_name.startswith('~$'):
            continue  # Word lock file, not a real document

        doc = Document(file_path)
        # Collect first and remove afterwards so the document is not
        # modified while its paragraphs are being iterated.
        blank_elements = [
            para._element
            for para in doc.paragraphs
            if _is_removable_paragraph(para)
        ]
        for element in blank_elements:
            element.getparent().remove(element)

        doc.save(os.path.join(output_dir, file_name))
# Project root; the step folders 02docx and 03cleaned are addressed below it.
base_path = r"D:\project\weekly_planning"
# Script entry point for Step 3: clean 02docx into 03cleaned.
remove_empty_paragraphs(base_path + r'\02docx', base_path + r'\03cleaned')
Step 4: Content Extraction to Excel
The script extracts structured content from Word documents, including information windows and four sections of theme knowledge (theme name, description, objectives, and home-school cooperation).
from openpyxl import load_workbook
from docx import Document
import glob
import os
def _section_text(paragraphs, start, marker_positions):
    """Join the paragraphs between the heading at *start* and the next heading."""
    if start is None:
        return ''  # heading not present in this document
    later = [pos for pos in marker_positions if pos > start]
    stop = min(later) if later else len(paragraphs)
    return '\n'.join(p.text for p in paragraphs[start + 1:stop])


def extract_content_to_excel(base_dir):
    """Extract information-window and theme content from Word files into Excel.

    Reads ``template.xlsx`` from *base_dir*, fills one row per document found
    in ``<base_dir>/03cleaned/*.docx`` (in sorted filename order, starting at
    sheet row 2) and saves the result as ``extracted_data.xlsx``.

    Columns written per document:
        1  running number            9   running number
        2  information-window text   10  theme name
                                     11  theme description
                                     12  theme objectives
                                     13  home-school cooperation

    After all rows are written, every string cell of the sheet is stripped,
    has its spaces removed and has 'N、' list markers rewritten as 'N.'.

    Args:
        base_dir: Project folder holding the template, the ``03cleaned``
            input folder and the output workbook.

    Returns:
        None.
    """
    import re  # local import: only this function needs it

    template_path = os.path.join(base_dir, 'template.xlsx')
    output_path = os.path.join(base_dir, 'extracted_data.xlsx')

    # Heading text -> key; checked in this order, first match per paragraph.
    section_markers = (
        ('description_start', '主题说明:'),
        ('objectives_start', '主题目标:'),
        ('cooperation_start', '家园共育:'),
    )
    theme_name_marker = '主题名称:'
    # Any digit directly before '、', so that '10、' is converted as well.
    numbered_item = re.compile(r'([0-9])、')

    workbook = load_workbook(template_path)
    try:
        sheet = workbook.active

        # Sorted so the running number follows the (zero-padded) file names
        # instead of whatever order the file system happens to return.
        doc_paths = sorted(
            path
            for path in glob.glob(os.path.join(base_dir, '03cleaned', '*.docx'))
            if not os.path.basename(path).startswith('~$')
        )

        for row_index, doc_path in enumerate(doc_paths, start=1):
            # Document.paragraphs builds a new list on each access; take it once.
            paragraphs = Document(doc_path).paragraphs
            target_row = row_index + 1  # row 1 is never written here

            # Extract information window content: everything after the first
            # paragraph up to the stop marker.
            # NOTE(review): the stop marker is an English literal while every
            # other marker is Chinese -- confirm it matches the documents.
            info_content = []
            for para in paragraphs[1:]:
                text = para.text.strip()
                if text == 'Class Name':  # Stop marker
                    break
                if text:
                    info_content.append(' ' + text)

            sheet.cell(row=target_row, column=1).value = row_index
            sheet.cell(row=target_row, column=2).value = '\n'.join(info_content)

            # Find section boundaries (the last occurrence of a heading wins).
            section_indices = {}
            for i, para in enumerate(paragraphs):
                for key, marker in section_markers:
                    if marker in para.text:
                        section_indices[key] = i
                        break

            # Extract theme name: the text following the marker in the first
            # paragraph that contains it.
            theme_name = ''
            for para in paragraphs:
                _, found, remainder = para.text.partition(theme_name_marker)
                if found:
                    theme_name = remainder
                    break

            # Extract section content; a section ends at the next heading
            # that was actually found, or at the end of the document.
            found_positions = list(section_indices.values())
            description = _section_text(
                paragraphs, section_indices.get('description_start'), found_positions
            )
            objectives = _section_text(
                paragraphs, section_indices.get('objectives_start'), found_positions
            )
            cooperation = _section_text(
                paragraphs, section_indices.get('cooperation_start'), found_positions
            )

            # Write to Excel columns
            sheet.cell(row=target_row, column=9).value = row_index
            sheet.cell(row=target_row, column=10).value = theme_name
            sheet.cell(row=target_row, column=11).value = description
            sheet.cell(row=target_row, column=12).value = objectives
            sheet.cell(row=target_row, column=13).value = cooperation

        # Clean up formatting.
        # NOTE(review): removing every space also removes the one-space
        # indent added to the information-window lines above -- confirm
        # whether a different (e.g. full-width) indent was intended.
        for row in sheet.iter_rows():
            for cell in row:
                if cell.value and isinstance(cell.value, str):
                    text = cell.value.strip().replace(' ', '')
                    cell.value = numbered_item.sub(r'\g<1>.', text)

        workbook.save(output_path)
    finally:
        workbook.close()
# Script entry point for Step 4: build extracted_data.xlsx for the project.
extract_content_to_excel(r"D:\project\weekly_planning")
Step 5: Generating Documents from Template
Using the docxtpl library, the script generates individual Word documents by merging Excel data with a template.
from docxtpl import DocxTemplate
import pandas as pd
import os
def generate_documents_from_template(base_dir):
    """Render one Word document per row of ``extracted_data.xlsx``.

    Each row of the workbook is turned into a template context, rendered
    with ``template_document.docx`` and saved into
    ``<base_dir>/generated_docs`` (created if missing) as
    ``Week NN <classroom> Info Window Theme (<period>).docx``.

    Empty cells are rendered as empty text rather than the string 'nan'.

    Args:
        base_dir: Project folder holding the workbook and the template.

    Returns:
        None.

    Raises:
        ValueError: If a row's week number ('name' column) is empty or not
            a whole number.
    """
    output_dir = os.path.join(base_dir, 'generated_docs')
    os.makedirs(output_dir, exist_ok=True)
    template_path = os.path.join(base_dir, 'template_document.docx')

    data = pd.read_excel(os.path.join(base_dir, 'extracted_data.xlsx'))

    # Template variable -> workbook column, for the plain-text fields.
    text_columns = {
        'title': 'title',
        'content': 'content',
        'classroom': 'classroom',
        'period': 'time',
        'theme_title': 'titlename',
        'theme_desc': 'sm',
        'theme_obj': 'mb',
        'home_coop': 'gy',
    }

    def cell_text(value):
        """Return '' for an empty cell, else the value as right-stripped text."""
        return '' if pd.isna(value) else str(value).rstrip()

    for _, row in data.iterrows():
        context = {
            key: cell_text(row[column]) for key, column in text_columns.items()
        }
        # pandas reads the column as float as soon as one cell is empty;
        # convert to int so the ':02d' format below keeps working.
        context['week_num'] = int(row['name'])

        # A new DocxTemplate is loaded for every row, as in the original
        # script; presumably a rendered template should not be rendered
        # again -- confirm against the docxtpl documentation.
        template = DocxTemplate(template_path)
        template.render(context)

        filename = f"Week {context['week_num']:02d} {context['classroom']} Info Window Theme ({context['period']}).docx"
        template.save(os.path.join(output_dir, filename))
# Script entry point for Step 5: render the documents for the project.
generate_documents_from_template(r"D:\project\weekly_planning")
Workflow Summary
The complete automation pipeline processes weekly planning documents through multiple stages: filename normalization, format conversion, content cleaning, data extraction, and document generation. This approach enables rapid batch processing of educational planning materials while maintaining consistent formatting across all outputs.