Automating Microsoft Word Document Processing with Python and COM
The win32com.client module enables Python to control Microsoft Word via the Component Object Model (COM) interface. This capability supports direct text extraction, format conversion, and automated document generation.
Extracting Content from Word Files
The COM interface exposes the Word application object model. By instantiating the application and opening a target file, individual paragraphs can be iterated to extract raw text.
import os
import win32com.client as win32
def extract_text_from_docx(source_path):
    """Print every non-empty paragraph of a Word document to stdout.

    Obtains a Word application object through COM, opens ``source_path``
    (passed to Word as an absolute path), prints the stripped text of each
    paragraph that is not blank, then closes the document without saving
    and quits Word.

    Args:
        source_path: Path to the Word document to read.
    """
    word_app = win32.Dispatch("Word.Application")
    # Bound before the ``try`` so the cleanup code can tell whether Open()
    # succeeded without inspecting locals().
    doc_obj = None
    try:
        doc_obj = word_app.Documents.Open(os.path.abspath(source_path))
        for paragraph in doc_obj.Paragraphs:
            text_block = paragraph.Range.Text.strip()
            if text_block:
                print(text_block)
    finally:
        try:
            if doc_obj is not None:
                doc_obj.Close(SaveChanges=False)
        finally:
            # Nested so Quit() is still called when Close() itself raises.
            word_app.Quit()
# Example usage: the sample document is looked up in the current working directory.
working_dir = os.getcwd()
target_file = os.path.join(working_dir, "sample_report.doc")
extract_text_from_docx(target_file)
Converting Word Documents to Plain Text
Saving a Word document in an alternative format uses the same COM interface. The SaveAs method accepts a numeric format code; 2 (Word's wdFormatText constant) corresponds to standard plain text output.
import os
import win32com.client as win32
def convert_to_text(input_doc, output_txt):
    """Save a Word document as a plain-text file.

    Opens ``input_doc`` in a hidden Word instance, writes it to
    ``output_txt`` using Word's plain-text save format, then closes the
    document without saving further changes and quits Word. Both paths are
    passed to Word in absolute form.

    Args:
        input_doc: Path to the Word document to convert.
        output_txt: Path of the text file to write.
    """
    # Value of Word's WdSaveFormat.wdFormatText (plain text).
    wd_format_text = 2

    app = win32.Dispatch("Word.Application")
    # Bound before the ``try`` so the cleanup code can tell whether Open()
    # succeeded without inspecting locals().
    document = None
    try:
        app.Visible = False
        document = app.Documents.Open(os.path.abspath(input_doc))
        document.SaveAs(os.path.abspath(output_txt), FileFormat=wd_format_text)
    finally:
        try:
            if document is not None:
                document.Close(SaveChanges=False)
        finally:
            # Nested so Quit() is still called when Close() itself raises.
            app.Quit()
# Example usage: export "draft.doc" from the working directory as plain text.
working_dir = os.getcwd()
source_doc = os.path.join(working_dir, "draft.doc")
dest_txt = os.path.join(working_dir, "draft_export.txt")
convert_to_text(source_doc, dest_txt)
Programmatic Generation of Word Files
New documents can be initialized, populated with dynamic strings, and saved to disk. The following routine demonstrates injecting personalized content into multiple files using a loop.
import os
import win32com.client as win32
def generate_report(recipient_name, output_directory):
    """Create a short greeting document for one recipient and save it.

    Adds a new document in a hidden Word instance, inserts a two-line
    greeting addressed to ``recipient_name``, and saves it as
    ``<recipient_name>.doc`` inside ``output_directory`` (resolved to an
    absolute path). The document is closed and Word is quit even when an
    intermediate COM call fails.

    Args:
        recipient_name: Name used in the greeting and as the file's base name.
        output_directory: Directory in which the document is saved.
    """
    office_app = win32.Dispatch("Word.Application")
    # Bound before the ``try`` so cleanup knows whether a document exists.
    new_doc = None
    try:
        office_app.Visible = False
        new_doc = office_app.Documents.Add()
        cursor = new_doc.Range(0, 0)
        cursor.InsertAfter(f"Greetings {recipient_name},\n")
        cursor.InsertAfter("It has been a while since we last connected. Hope everything is going well.\n")
        save_path = os.path.join(os.path.abspath(output_directory), f"{recipient_name}.doc")
        # NOTE(review): no FileFormat is passed, so Word picks the on-disk
        # format; confirm it matches the ".doc" extension on the target
        # Word version.
        new_doc.SaveAs(save_path)
    finally:
        try:
            if new_doc is not None:
                new_doc.Close(SaveChanges=False)
        finally:
            # Nested so Quit() is still called when Close() itself raises.
            office_app.Quit()
# Produce one greeting document per recipient in the current working directory.
user_list = [
    "Alice Chen",
    "Marcus Reed",
    "Elena Vasquez",
]
project_folder = os.getcwd()
for user in user_list:
    generate_report(user, project_folder)