Home > Tech > Content

Creating Custom Word Clouds in Python

Tech Apr 21 20

pip install wordcloud

import os
from wordcloud import WordCloud

# Locate the input text next to this script when run as a file; fall back to
# the working directory when ``__file__`` is not defined (e.g. a REPL).
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

# Read the corpus inside a context manager so the file handle is closed
# deterministically rather than whenever the garbage collector gets to it.
with open(os.path.join(current_dir, 'document.txt'), encoding='utf-8') as text_file:
    raw_text = text_file.read()

# Build a cloud with default settings, save it to disk, and open a preview.
cloud_generator = WordCloud().generate(raw_text)
cloud_generator.to_file('output.jpg')
cloud_generator.to_image().show()

Parameter	Type	Default	Description
`font_path`	string	None	Filepath to the desired typeface.
`width`	int	400	Canvas width in pixels.
`height`	int	200	Canvas height in pixels.
`margin`	int	2	Pixel margin around the cloud edges.
`prefer_horizontal`	float	0.9	Likelihood of horizontal text orientation; values >=1 force horizontal layout.
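`mask`	nd-array or None	None	Array defining the cloud silhouette; pure-white pixels are masked out and words are drawn on all other pixels. When a mask is given, `width` and `height` are ignored in favor of the mask's shape.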
`contour_width`	float	0	Thickness of the contour line around the mask.
`contour_color`	color value	"black"	Color of the mask contour line.
`scale`	float	1	Scaling factor for the computed layout relative to the canvas.
`min_font_size`	int	4	Smallest allowed font size.
`font_step`	int	1	Step size for font size iteration, affecting the size gap between words.
`max_words`	number	200	Maximum number of tokens to render.
`stopwords`	set or None	STOPWORDS	Collection of words to exclude from the cloud.
`random_state`	int or None	None	Seed for the random number generator to ensure reproducibility.
`background_color`	color value	"black"	Background color of the canvas.
`max_font_size`	int or None	None	Largest allowed font size.
`mode`	string	"RGB"	Color mode (e.g., "RGBA" allows transparent backgrounds).
`relative_scaling`	float	'auto'	Impact of word frequency on font size; higher values emphasize size differences.
`color_func`	callable	None	Custom function to determine the color of each token.
`regexp`	string or None	None	Regular expression used to split the input text into tokens; if None, `r"\w[\w']+"` is used.
`collocations`	bool	True	Whether to include bigrams (two-word phrases).
`colormap`	string or colormap	"viridis"	Matplotlib colormap for coloring the tokens.
`normalize_plurals`	bool	True	If True, merges plural words ending in 's' with their singular form.
`repeat`	bool	False	Whether to repeat words and phrases until `max_words` or `min_font_size` is reached.
`include_numbers`	bool	False	Whether to include numeric tokens.
`min_word_length`	int	0	Minimum character count required for a token.
`collocation_threshold`	int	30	Threshold for bigram collocation scoring.

import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Load the shape image. cv2.imread signals failure by returning None instead
# of raising, so check explicitly to get a clear error rather than an opaque
# failure inside cvtColor.
source_img = cv2.imread('shape_image.png')
if source_img is None:
    raise FileNotFoundError("Could not read image file 'shape_image.png'")

# Collapse to one channel, then binarize: with THRESH_BINARY_INV, pixels
# brighter than 250 become 0 and every other pixel becomes 255.
# NOTE(review): wordcloud treats pure-white (255) mask pixels as masked out,
# so this mask lets words be drawn where the source was near-white and blocks
# them elsewhere — confirm this polarity is what is wanted for the shape.
grayscale = cv2.cvtColor(source_img, cv2.COLOR_BGR2GRAY)
_, binary_mask = cv2.threshold(grayscale, 250, 255, cv2.THRESH_BINARY_INV)
cv2.imwrite('derived_mask.png', binary_mask)

# Show the source image and the derived mask side by side.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
# OpenCV loads images as BGR; matplotlib expects RGB.
ax[0].imshow(cv2.cvtColor(source_img, cv2.COLOR_BGR2RGB))
ax[0].set_title('Source')
ax[0].axis('off')
ax[1].imshow(binary_mask, cmap='gray')
ax[1].set_title('Mask')
ax[1].axis('off')
plt.show()

from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image

# Mask array read from the file written by the mask-derivation step.
custom_mask = np.array(Image.open("derived_mask.png"))

# Library stopwords plus one extra word to exclude.
filtered_stopwords = STOPWORDS | {"example"}

# Configure the cloud, lay out the words from the corpus, and save the result.
cloud = WordCloud(
    mask=custom_mask,
    stopwords=filtered_stopwords,
    max_words=2000,
    background_color="white",
    contour_color='steelblue',
    contour_width=3,
).generate(raw_text)
cloud.to_file("shaped_output.png")

from wordcloud import ImageColorGenerator

# The same image serves as both the mask and the colour reference.
coloring_img = np.array(Image.open('color_source.png'))
color_gen = ImageColorGenerator(coloring_img)

colored_cloud = WordCloud(
    background_color="white",
    mask=coloring_img,
    max_font_size=40,
    random_state=42,
).generate(raw_text)

# Three panels: default colours, image-derived colours, and the source image.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Draw the default-coloured cloud first: recolor() below is applied to the
# same cloud object, so this panel must be rendered before it is called.
axes[0].imshow(colored_cloud, interpolation="bilinear")
recolored = colored_cloud.recolor(color_func=color_gen)
axes[1].imshow(recolored, interpolation="bilinear")
axes[2].imshow(coloring_img, interpolation="bilinear")
for ax in axes:
    ax.axis('off')
plt.show()

import multidict as multidict
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def build_freq_dict(corpus):
    """Count how often each non-trivial word occurs in *corpus*.

    The text is split on whitespace and every token is lower-cased. Tokens
    that are exactly one of a small set of English function words are
    skipped; all other tokens are counted.

    Args:
        corpus: The text to analyse.

    Returns:
        A ``multidict.MultiDict`` holding one ``(token, count)`` entry per
        distinct lower-cased token, in first-seen order.
    """
    # Whole-word membership test on the lower-cased token. An unanchored
    # regex ``match`` against "a|the|an|..." succeeds on any token that merely
    # *starts* with one of these words, which silently discards "apple",
    # "bear", "into", "theory", and so on.
    ignored = frozenset({
        "a", "the", "an", "to", "in", "for", "of", "or",
        "by", "with", "is", "on", "that", "be",
    })

    interim_dict = {}
    # str.split() with no separator already splits on any run of whitespace,
    # newlines included, so no newline substitution is needed beforehand.
    for token in corpus.split():
        normalized = token.lower()
        if normalized in ignored:
            continue
        interim_dict[normalized] = interim_dict.get(normalized, 0) + 1

    term_counts = multidict.MultiDict()
    for key, val in interim_dict.items():
        term_counts.add(key, val)
    return term_counts

# Reuse the derived silhouette as the layout mask.
freq_mask = np.array(Image.open("derived_mask.png"))

# Build the cloud directly from precomputed word counts instead of raw text.
freq_cloud = WordCloud(
    mask=freq_mask,
    max_words=1000,
    background_color="white",
).generate_from_frequencies(build_freq_dict(raw_text))

# Display the rendered cloud without axis decorations.
plt.imshow(freq_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

import jieba
from imageio import imread

# Font file passed to WordCloud as font_path for the Chinese cloud.
cn_font = r'C:\Windows\Fonts\simfang.ttf'

# Read the document inside a context manager so the file handle is closed
# deterministically.
with open('chinese_doc.txt', encoding='utf-8') as cn_file:
    cn_text = cn_file.read()
cn_mask = imread('cn_mask.png')

# Extra vocabulary that tokenize_chinese registers with jieba via add_word.
custom_terms = ['特定词1', '特定词2']

def tokenize_chinese(source_text, stopwords_file):
    """Segment text with jieba and drop stopwords and very short tokens.

    Args:
        source_text: Raw text to segment.
        stopwords_file: Path to a UTF-8 text file with one stopword per line.

    Returns:
        A single string of the surviving tokens separated by spaces. A token
        survives when, after stripping surrounding whitespace, it is longer
        than one character and is not listed in the stopword file.
    """
    # Register the module-level custom vocabulary with jieba before cutting.
    for vocab in custom_terms:
        jieba.add_word(vocab)

    # A set gives constant-time membership tests; a list would be scanned in
    # full for every token.
    with open(stopwords_file, encoding='utf-8') as sw_file:
        stopword_set = set(sw_file.read().splitlines())

    # Consume jieba's tokens directly rather than joining them with "/ " and
    # splitting the result again.
    valid_words = []
    for token in jieba.cut(source_text, cut_all=False):
        word = token.strip()
        if len(word) > 1 and word not in stopword_set:
            valid_words.append(word)
    return ' '.join(valid_words)

# Configure the cloud with the Chinese font and mask, lay out the segmented
# text, and save the rendered image.
cn_cloud = WordCloud(
    mask=cn_mask,
    font_path=cn_font,
    width=1000,
    height=860,
    margin=2,
    max_words=2000,
    max_font_size=100,
    background_color="white",
    random_state=42,
).generate(tokenize_chinese(cn_text, 'stopwords_cn_en.txt'))
cn_cloud.to_file('chinese_result.png')

import jieba
import numpy as np
from imageio import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Font file passed to WordCloud as font_path for the lyrics cloud.
cn_font = r'C:\Windows\Fonts\simfang.ttf'

# Read the lyrics inside a context manager so the file handle is closed
# deterministically.
with open('lyrics.txt', encoding='utf-8') as lyrics_file:
    text_data = lyrics_file.read()

# Photo used later for colouring, and the silhouette used as the layout mask.
color_source = imread('singer.jpg')
shape_mask = imread('silhouette_mask.png')

def process_mandarin(text, stopword_path):
    """Segment *text* with jieba and filter out stopwords and short tokens.

    Args:
        text: Raw text to segment.
        stopword_path: Path to a UTF-8 text file with one stopword per line.

    Returns:
        The kept tokens joined by single spaces. A token is kept when its
        whitespace-stripped form is longer than one character and does not
        appear in the stopword file.
    """
    # Join the segments with "/ " and split on "/" again further down.
    slash_joined = "/ ".join(jieba.cut(text, cut_all=False))

    with open(stopword_path, encoding='utf-8') as handle:
        stops = set(handle.read().splitlines())

    kept = []
    for piece in slash_joined.split('/'):
        candidate = piece.strip()
        if candidate in stops or len(candidate) <= 1:
            continue
        kept.append(candidate)
    return ' '.join(kept)

# Segment and filter the lyrics before laying out the cloud.
processed_text = process_mandarin(text_data, 'stopwords_cn_en.txt')

final_cloud = WordCloud(
    mask=shape_mask,
    font_path=cn_font,
    max_font_size=100,
    background_color="white",
    random_state=42,
)
final_cloud.generate(processed_text)

# Recolour the laid-out words using a generator built from the photo.
color_mapper = ImageColorGenerator(np.array(color_source))
recolored_cloud = final_cloud.recolor(color_func=color_mapper)

# Side-by-side comparison: the photo on the left, the cloud on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (color_source, 'Original Photo', {}),
    (recolored_cloud, 'Generated Cloud', {"interpolation": "bilinear"}),
)
for axis, (picture, title, imshow_kwargs) in zip(axes, panels):
    axis.imshow(picture, **imshow_kwargs)
    axis.set_title(title)
    axis.axis('off')
plt.show()

Tags: Python

Back to List

Prev: Converting Python Scripts to Standalone Windows Executables with py2exe

Next: Implementing Sequential Lists (Dynamic Arrays) in C

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Fading Coder

Creating Custom Word Clouds in Python

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment

Copyright © fadingcoder.top

Fading Coder

Creating Custom Word Clouds in Python

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment | Cancel Reply

Copyright © fadingcoder.top

Leave a Comment