Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Creating Custom Word Clouds in Python

Tech 2
pip install wordcloud
import os
from wordcloud import WordCloud

# Look for the input text next to this script when running as a file;
# fall back to the working directory when __file__ is not defined
# (for example in an interactive session).
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()

# Read the corpus inside a context manager so the file handle is closed
# as soon as the text is loaded instead of lingering until garbage collection.
with open(os.path.join(current_dir, 'document.txt'), encoding='utf-8') as text_file:
    raw_text = text_file.read()

# Build a cloud with the default settings, save it, then preview it.
cloud_generator = WordCloud().generate(raw_text)
cloud_generator.to_file('output.jpg')
cloud_generator.to_image().show()
Parameter Type Default Description
font_path string None Filepath to the desired typeface.
width int 400 Canvas width in pixels.
height int 200 Canvas height in pixels.
margin int 2 Pixel margin around the cloud edges.
prefer_horizontal float 0.9 Likelihood of horizontal text orientation; values >=1 force horizontal layout.
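mask nd-array or None None Array defining the cloud silhouette; pure-white (#FF or #FFFFFF) entries are masked out and words are drawn on all other areas. When a mask is given, width and height are ignored and the mask's shape is used instead.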
contour_width float 0 Thickness of the contour line around the mask.
contour_color color value "black" Color of the mask contour line.
scale float 1 Scaling factor for the computed layout relative to the canvas.
min_font_size int 4 Smallest allowed font size.
font_step int 1 Step size for font size iteration, affecting the size gap between words.
max_words number 200 Maximum number of tokens to render.
stopwords set or None STOPWORDS Collection of words to exclude from the cloud.
random_state int or None None Seed for the random number generator to ensure reproducibility.
background_color color value "black" Background color of the canvas.
max_font_size int or None None Largest allowed font size.
mode string "RGB" Color mode (e.g., "RGBA" allows transparent backgrounds).
relative_scaling float 'auto' Impact of word frequency on font size; higher values emphasize size differences.
color_func callable None Custom function to determine the color of each token.
regexp string or None None Regular expression used to split the input text into tokens; a default word pattern is used when None.
collocations bool True Whether to include bigrams (two-word phrases).
colormap string or colormap "viridis" Matplotlib colormap for coloring the tokens.
normalize_plurals bool True If True, merges plural words ending in 's' with their singular form.
repeat bool False Whether to repeat words and phrases until max_words or min_font_size is reached.
include_numbers bool False Whether to include numeric tokens.
min_word_length int 0 Minimum character count required for a token.
collocation_threshold int 30 Threshold for bigram collocation scoring.
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Load the picture the mask is derived from. cv2.imread does not raise when
# the file is missing or unreadable -- it returns None -- so fail early with
# a clear message instead of an obscure error inside cvtColor.
source_img = cv2.imread('shape_image.png')
if source_img is None:
    raise FileNotFoundError("could not read image file: 'shape_image.png'")

grayscale = cv2.cvtColor(source_img, cv2.COLOR_BGR2GRAY)

# Inverted binary threshold: pixels brighter than 250 become 0, every other
# pixel becomes 255.
# NOTE(review): WordCloud treats pure-white mask entries as "masked out", so
# with this polarity the words land on the near-white regions of the source
# picture -- confirm that this matches the intended silhouette.
_, binary_mask = cv2.threshold(grayscale, 250, 255, cv2.THRESH_BINARY_INV)
cv2.imwrite('derived_mask.png', binary_mask)

# Side-by-side preview: original picture (converted from BGR to RGB for
# matplotlib) on the left, derived mask on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].imshow(cv2.cvtColor(source_img, cv2.COLOR_BGR2RGB))
ax[0].set_title('Source')
ax[0].axis('off')
ax[1].imshow(binary_mask, cmap='gray')
ax[1].set_title('Mask')
ax[1].axis('off')
plt.show()
from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image

# Load the mask image written by the thresholding step as a NumPy array.
mask_image = Image.open("derived_mask.png")
custom_mask = np.array(mask_image)

# Copy the library's built-in stop words and extend the copy with one
# project-specific entry.
filtered_stopwords = set(STOPWORDS) | {"example"}

# Collect the cloud settings in one place, then build, fill and save the cloud.
cloud_options = {
    "background_color": "white",
    "max_words": 2000,
    "mask": custom_mask,
    "stopwords": filtered_stopwords,
    "contour_width": 3,
    "contour_color": 'steelblue',
}
cloud = WordCloud(**cloud_options)
cloud.generate(raw_text)
cloud.to_file("shaped_output.png")
from wordcloud import ImageColorGenerator

# The same picture serves two purposes: it is the mask for the layout and
# the color source for the recoloring step.
coloring_img = np.array(Image.open('color_source.png'))
color_gen = ImageColorGenerator(coloring_img)

colored_cloud = WordCloud(
    background_color="white",
    mask=coloring_img,
    max_font_size=40,
    random_state=42,
)
colored_cloud.generate(raw_text)

# Three panels: the cloud as generated, the cloud recolored from the picture,
# and the picture itself. The first panel must be drawn before recolor() runs.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
default_panel, recolored_panel, picture_panel = axes
default_panel.imshow(colored_cloud, interpolation="bilinear")
recolored_panel.imshow(colored_cloud.recolor(color_func=color_gen), interpolation="bilinear")
picture_panel.imshow(coloring_img, interpolation="bilinear")
for ax in axes:
    ax.axis('off')
plt.show()
import multidict as multidict
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def build_freq_dict(corpus):
    """Count how often each word occurs in *corpus*, skipping common function words.

    The text is split on whitespace and every token is lower-cased before it
    is filtered and counted, so "Cloud" and "cloud" share one entry and
    "The" is skipped just like "the".

    Args:
        corpus: Text to analyse.

    Returns:
        A ``multidict.MultiDict`` holding one ``(word, count)`` entry per
        distinct kept word, in first-seen order.
    """
    # Compare whole lower-cased tokens against the ignore list. A prefix
    # match (re.match on "a|the|an|...") would also discard every word that
    # merely starts with one of these, e.g. "apple", "information", "become".
    ignored = frozenset((
        "a", "the", "an", "to", "in", "for", "of", "or",
        "by", "with", "is", "on", "that", "be",
    ))

    interim_dict = {}
    # str.split() with no argument already treats newlines as separators.
    for token in corpus.split():
        normalized = token.lower()
        if normalized in ignored:
            continue
        interim_dict[normalized] = interim_dict.get(normalized, 0) + 1

    term_counts = multidict.MultiDict()
    for key, val in interim_dict.items():
        term_counts.add(key, val)
    return term_counts

# Reuse the derived mask image as the layout silhouette.
freq_mask = np.array(Image.open("derived_mask.png"))

# Render from precomputed word counts rather than from raw text.
freq_cloud = WordCloud(
    background_color="white",
    max_words=1000,
    mask=freq_mask,
)
word_counts = build_freq_dict(raw_text)
freq_cloud.generate_from_frequencies(word_counts)

# Show the result without axis decorations.
plt.imshow(freq_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
import jieba
from imageio import imread

# Typeface file handed to WordCloud as font_path further down.
# NOTE(review): Windows-specific location -- presumably chosen because the
# default font cannot render Chinese glyphs; adjust on other platforms.
cn_font = r'C:\Windows\Fonts\simfang.ttf'

# Read the Chinese corpus inside a context manager so the file handle is
# closed as soon as the text is loaded.
with open('chinese_doc.txt', encoding='utf-8') as cn_file:
    cn_text = cn_file.read()
cn_mask = imread('cn_mask.png')

# Extra vocabulary registered with jieba (via add_word) by tokenize_chinese.
custom_terms = ['特定词1', '特定词2']

def tokenize_chinese(source_text, stopwords_file):
    """Segment *source_text* with jieba and return the kept words space-joined.

    Every entry of the module-level ``custom_terms`` list is registered with
    jieba before segmentation. A segmented piece is kept when, after
    stripping surrounding whitespace, it is longer than one character and
    does not appear in the stop-word file.

    Args:
        source_text: Text to segment.
        stopwords_file: Path to a UTF-8 text file with one stop word per line.

    Returns:
        The kept words joined by single spaces.
    """
    for vocab in custom_terms:
        jieba.add_word(vocab)

    tokens = jieba.cut(source_text, cut_all=False)
    joined = "/ ".join(tokens)

    with open(stopwords_file, encoding='utf-8') as sw_file:
        # A set gives constant-time membership tests; a list would be
        # rescanned for every segmented word.
        stop_set = set(sw_file.read().splitlines())

    stripped_pieces = (piece.strip() for piece in joined.split('/'))
    valid_words = [w for w in stripped_pieces if len(w) > 1 and w not in stop_set]
    return ' '.join(valid_words)

# Settings for the Chinese cloud, gathered in one mapping.
# NOTE(review): the WordCloud documentation says width/height are ignored
# when a mask is supplied -- confirm whether these two entries have any effect.
cn_cloud_settings = {
    "font_path": cn_font,
    "background_color": "white",
    "max_words": 2000,
    "mask": cn_mask,
    "max_font_size": 100,
    "random_state": 42,
    "width": 1000,
    "height": 860,
    "margin": 2,
}
cn_cloud = WordCloud(**cn_cloud_settings)

# Segment the corpus first, then lay out and save the cloud.
segmented_cn_text = tokenize_chinese(cn_text, 'stopwords_cn_en.txt')
cn_cloud.generate(segmented_cn_text)
cn_cloud.to_file('chinese_result.png')
import jieba
import numpy as np
from imageio import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Typeface file handed to WordCloud as font_path further down.
# NOTE(review): Windows-specific location -- adjust on other platforms.
cn_font = r'C:\Windows\Fonts\simfang.ttf'

# Read the lyrics inside a context manager so the file handle is closed as
# soon as the text is loaded.
with open('lyrics.txt', encoding='utf-8') as lyrics_file:
    text_data = lyrics_file.read()

# One picture supplies the colors, another the silhouette.
color_source = imread('singer.jpg')
shape_mask = imread('silhouette_mask.png')

def process_mandarin(text, stopword_path):
    """Segment *text* with jieba and return the kept words joined by spaces.

    A segmented piece is kept when, after stripping surrounding whitespace,
    it is longer than one character and is not a line of the stop-word file.

    Args:
        text: Text to segment.
        stopword_path: Path to a UTF-8 text file with one stop word per line.

    Returns:
        The kept words joined by single spaces.
    """
    slash_joined = "/ ".join(jieba.cut(text, cut_all=False))

    with open(stopword_path, encoding='utf-8') as handle:
        excluded = set(handle.read().splitlines())

    kept = []
    for fragment in slash_joined.split('/'):
        candidate = fragment.strip()
        if candidate in excluded or len(candidate) <= 1:
            continue
        kept.append(candidate)
    return ' '.join(kept)

# Segment the lyrics and drop stop words before layout.
processed_text = process_mandarin(text_data, 'stopwords_cn_en.txt')

# Lay the words out inside the silhouette mask.
final_cloud = WordCloud(
    font_path=cn_font,
    background_color="white",
    mask=shape_mask,
    max_font_size=100,
    random_state=42,
)
final_cloud = final_cloud.generate(processed_text)

# Recolor the finished layout with colors taken from the photo.
color_mapper = ImageColorGenerator(np.array(color_source))
recolored_cloud = final_cloud.recolor(color_func=color_mapper)

# Left panel: the photo. Right panel: the recolored cloud.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
photo_panel, cloud_panel = axes
photo_panel.imshow(color_source)
cloud_panel.imshow(recolored_cloud, interpolation="bilinear")
for panel, caption in zip(axes, ('Original Photo', 'Generated Cloud')):
    panel.set_title(caption)
    panel.axis('off')
plt.show()
Tags: Python

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.