Automated Batch Generation of Weekly Planning Documents and Image Export via Python
Standardizing Filename Conventions
Legacy project directories often contain inconsistently named files. Normalizing these names ensures predictable path resolution and sequential processing. The following routine scans a target directory, isolates the week identifier, and applies zero-padding for single-digit entries.
import os
from pathlib import Path

target_dir = Path("./legacy_planning_files")

# Snapshot the directory contents up front so renames below do not
# disturb the iteration.
snapshot = list(target_dir.iterdir())

for entry in snapshot:
    if not entry.is_file():
        continue
    base = entry.stem
    # Only touch files whose names carry a week marker (Chinese or English).
    if "周" in base:
        marker = "周"
    elif "Week" in base:
        marker = "Week"
    else:
        continue
    # Keep everything before the marker (e.g. "第3" from "第3周...") and
    # rebuild a uniform suffix after it.
    leading = base.split(marker)[0]
    rebuilt = f"{leading}{marker} Information_Window.doc"
    source = target_dir / entry.name
    destination = target_dir / rebuilt
    if source != destination:
        # Path.replace silently overwrites an existing destination.
        source.replace(destination)

print("Naming convention updated.")
After basic alignment, single-digit week numbers require padding. Checking filename character counts prevents over-padding longer identifiers.
# Zero-pad single-digit week numbers (e.g. "第3周..." becomes "第03周...").
for candidate in target_dir.iterdir():
    if not candidate.name.endswith(".doc"):
        continue
    stem = candidate.stem
    # An 11-character stem without "第0" is assumed to carry a single-digit
    # week number -- NOTE(review): confirm this length against real filenames.
    if len(stem) != 11 or "第0" in stem:
        continue
    segments = stem.split("第")
    candidate.rename(target_dir / f"{segments[0]}第0{segments[1]}.doc")
Migrating Legacy .doc Files to .docx
Modern document pipelines rely on the Office Open XML standard. Automating the conversion from legacy .doc formats eliminates manual export steps. The script leverages pywin32 to launch an invisible Word instance, iterate through source files, and persist them as .docx.
import os
import win32com.client as win32
from pathlib import Path
def convert_doc_to_docx(source_folder: str, dest_folder: str) -> None:
    """Convert every legacy .doc file in *source_folder* to .docx.

    Drives a hidden Microsoft Word instance via COM (pywin32), so this
    only runs on Windows with Word installed.

    Args:
        source_folder: Directory scanned (non-recursively) for ``*.doc`` files.
        dest_folder: Directory receiving ``.docx`` output; created if absent.
    """
    src_path = Path(source_folder)
    dst_path = Path(dest_folder)
    dst_path.mkdir(parents=True, exist_ok=True)
    # Background Word instance; never shown to the user.
    word_app = win32.Dispatch("Word.Application")
    word_app.Visible = False
    try:
        for doc_file in src_path.glob("*.doc"):
            # Skip Word owner/lock files left behind by open documents.
            if doc_file.name.startswith("~$"):
                continue
            # COM resolves relative paths against Word's own working
            # directory, so always hand it an absolute path.
            doc_obj = word_app.Documents.Open(str(doc_file.resolve()))
            # BUG FIX: the old code appended "x" to the stem (producing an
            # extension-less "reportx"); .stem already drops ".doc", so the
            # full ".docx" suffix must be rebuilt.
            target_docx = dst_path / f"{doc_file.stem}.docx"
            # FileFormat 16 == wdFormatXMLDocument (OOXML .docx).
            doc_obj.SaveAs(Filename=str(target_docx.resolve()), FileFormat=16)
            doc_obj.Close()
            print(f"Converted: {doc_file.name}")
    finally:
        # Always shut the hidden Word process down, even on failure.
        word_app.Quit()
        del word_app

convert_doc_to_docx("./input_docs", "./output_docsx")
Purging Unnecessary Line Breaks
Scraped or templated documents frequently contain orphaned carriage returns. These disrupt spacing and table layouts. The python-docx library allows direct manipulation of the underlying XML tree to strip empty paragraphs safely.
from docx import Document
from pathlib import Path
def cleanse_empty_paragraphs(directory_path: str):
    """Strip whitespace-only paragraphs from every .docx in *directory_path*.

    Each document is rewritten in place. Removal works on the underlying
    XML <w:p> elements, which are collected first so the paragraph list is
    not mutated while it is being iterated.
    """
    folder = Path(directory_path)
    for document_path in folder.glob("*.docx"):
        document = Document(document_path)
        # Gather the XML elements of blank paragraphs before touching the tree.
        blanks = [p._element for p in document.paragraphs if not p.text.strip()]
        for element in blanks:
            container = element.getparent()
            if container is not None:
                container.remove(element)
        document.save(document_path)
        print(f"Cleaned: {document_path.name}")

# Example usage:
# cleanse_empty_paragraphs("./cleaned_documents")
Extracting Content and Populating Spreadsheet Structures
Once documents are standardized, extracting structured text allows population of a master tracking sheet. This routine reads each processed file, skips header metadata, aggregates body text, and writes it into designated Excel columns.
import glob
from docx import Document
from openpyxl import Workbook
def extract_and_populate_spreadsheet(input_dir: str, output_excel: str) -> None:
    """Collect body text from each .docx in *input_dir* into one Excel sheet.

    One spreadsheet row is written per document: a sequence number in
    column A and the document's non-empty paragraphs (newline-joined) in
    column B. A leading paragraph that looks like header metadata is
    skipped.

    Args:
        input_dir: Directory scanned (non-recursively) for .docx files.
        output_excel: Path of the .xlsx workbook to create/overwrite.
    """
    import os  # local import: this snippet's header pulls in only glob/docx/openpyxl

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Weekly_Contents"
    # Reserve headers.
    sheet.cell(row=1, column=1, value="Sequence_ID")
    sheet.cell(row=1, column=2, value="Planned_Content")
    row_counter = 1  # row 1 holds the headers; data starts at row 2
    # BUG FIX: the pattern was built with a hard-coded backslash separator,
    # which only matched paths on Windows; os.path.join is portable.
    doc_pattern = os.path.join(input_dir, "*.docx")
    for file_path in glob.glob(doc_pattern):
        doc = Document(file_path)
        extracted_lines = []
        for idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            # Drop a leading header/metadata paragraph if present.
            if idx == 0 and ("header" in text.lower() or "class" in text.lower()):
                continue
            if not text:
                continue
            extracted_lines.append(f" {text}")
        # Advance exactly once per document so each file maps to one row
        # (incrementing per paragraph would scatter rows across the sheet).
        row_counter += 1
        merged_content = "\n".join(extracted_lines)
        sheet.cell(row=row_counter, column=1, value=row_counter - 1)
        sheet.cell(row=row_counter, column=2, value=merged_content)
    workbook.save(output_excel)
    print(f"Data exported to {output_excel}")

# extract_and_populate_spreadsheet("./processed", "./master_schedule.xlsx")
Merging Data with Word Templates via Dynamic Rendering
With a consolidated dataset, individual planning sheets can be generated programmatically. Using docxtpl alongside pandas enables seamless injection of cell values into predefined placeholder tags within a base template.
import pandas as pd
from docxtpl import DocxTemplate
from pathlib import Path
def render_weekly_templates(data_source: str, template_path: str, output_dir: str) -> None:
    """Render one planning document per spreadsheet row from a docx template.

    Args:
        data_source: Excel workbook holding one row per weekly plan.
        template_path: .docx template containing docxtpl placeholder tags.
        output_dir: Directory for the rendered documents; created if absent.
    """
    df = pd.read_excel(data_source)
    # Normalize whitespace on the columns injected into the template.
    # NOTE(review): astype(str) turns missing cells into the string "nan";
    # confirm the source sheet has no blanks, or fillna("") first.
    text_cols = ["week_title", "main_topic", "teacher_name", "class_group", "date_range"]
    for col in text_cols:
        df[col] = df[col].astype(str).str.strip()
    out_path = Path(output_dir)
    out_path.mkdir(exist_ok=True)
    for _, row in df.iterrows():
        # FIX: load a fresh template at the top of each iteration. docxtpl's
        # render() mutates the template in memory, so it cannot be reused;
        # the old code reloaded it at the loop bottom, which also built one
        # extra, never-used template after the final row.
        tpl = DocxTemplate(template_path)
        context_map = {
            "title": row["week_title"],
            "content": row["main_topic"],
            "educator": row["teacher_name"],
            "group": row["class_group"],
            "schedule": row["date_range"],
        }
        tpl.render(context_map)
        safe_filename = f"Week_{row['week_title']}_{row['class_group']}_Plan.docx"
        tpl.save(out_path / safe_filename)

# render_weekly_templates("./master_schedule.xlsx", "./base_template.docx", "./generated_plans")
Batch Processing Documents into Image Assets
For digital publishing platforms that restrict file uploads, converting finished .docx artifacts to rasterized images is often required. This pipeline copies assets, renders them as PDFs via Word Automation, and uses pymupdf to extract high-resolution page snapshots.
import os
import shutil
import win32com.client as win32
import fitz
from pathlib import Path
def prepare_image_assets(source_plans: str, staging_area: str) -> None:
    """Turn every .docx plan into per-page PNG images in *staging_area*.

    Pipeline: copy the .docx files into a staging directory, convert each
    to PDF through a hidden Word instance (Windows + Word required), then
    rasterize every PDF page to PNG with PyMuPDF. Intermediate .docx and
    .pdf files are deleted as soon as they are consumed.
    """
    src = Path(source_plans)
    tmp = Path(staging_area)
    tmp.mkdir(exist_ok=True)

    # 1. Copy DOCX files into an isolated working directory so the
    #    originals are never touched.
    for item in src.glob("*.docx"):
        shutil.copy2(item, tmp / item.name)

    # 2. Convert to PDF via COM. FileFormat 17 == wdFormatPDF.
    word = win32.Dispatch("Word.Application")
    word.Visible = False
    try:
        for docx_file in tmp.glob("*.docx"):
            # COM resolves relative paths against Word's own working
            # directory, so pass absolute paths.
            doc = word.Documents.Open(str(docx_file.resolve()))
            pdf_out = docx_file.with_suffix(".pdf")
            doc.SaveAs(Filename=str(pdf_out.resolve()), FileFormat=17)
            doc.Close()
            docx_file.unlink()  # staging copy no longer needed
    finally:
        # FIX: Quit() was previously skipped if any conversion raised,
        # leaking a hidden WINWORD.EXE process.
        word.Quit()
        del word

    # 3. Rasterize PDF pages to PNG at 2x zoom (roughly 144 DPI).
    zoom_factor = 2.0
    matrix = fitz.Matrix(zoom_factor, zoom_factor)  # hoisted: loop-invariant
    for pdf_file in tmp.glob("*.pdf"):
        # Context manager guarantees the PDF handle is closed before unlink,
        # even if rasterization fails partway through.
        with fitz.open(pdf_file) as pdf_doc:
            for page_num in range(len(pdf_doc)):
                page = pdf_doc.load_page(page_num)
                pix = page.get_pixmap(matrix=matrix)
                img_out = tmp / f"{pdf_file.stem}_page_{page_num + 1}.png"
                pix.save(str(img_out))
        pdf_file.unlink()  # clean up the intermediate PDF

    print(f"Image assets generated in {staging_area}")

# prepare_image_assets("./generated_plans", "./web_ready_images")
Implementation Considerations
Automating document workflows yields measurable operational improvements when designed with modularity and consistency in mind:
- Standardized Metadata Injection: Centralizing parameters such as class identifiers, educator names, and scheduling dates guarantees typographic uniformity and eliminates repetitive manual entry errors.
- Legacy Content Reuse: Repurposing established templates preserves institutional formatting standards (font families, line spacing, margin ratios) while accelerating production cycles.
- Decoupled Architecture: Separating naming normalization, format migration, content parsing, and asset export into discrete functions simplifies debugging and accommodates varying input structures across academic terms.
- Output Flexibility: Maintaining separate routing for printable documents and web-ready image exports ensures compliance with diverse distribution channels without compromising source fidelity.