# Install the dependency first (shell command, not Python):  pip install wordcloud
import os
from wordcloud import WordCloud

# Resolve the script's own directory so 'document.txt' is found regardless of
# the current working directory (falls back to cwd in a REPL, where __file__
# is undefined).
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

# Use a context manager so the file handle is closed deterministically; the
# original open(...).read() relied on garbage collection to release it.
with open(os.path.join(current_dir, 'document.txt'), encoding='utf-8') as src:
    raw_text = src.read()

# Build the cloud with all default settings and render it.
cloud_generator = WordCloud().generate(raw_text)
cloud_generator.to_file('output.jpg')   # save the rendered cloud to disk
cloud_generator.to_image().show()       # open it in the default image viewer
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| font_path | string | None | Filepath to the desired typeface. |
| width | int | 400 | Canvas width in pixels. |
| height | int | 200 | Canvas height in pixels. |
| margin | int | 2 | Pixel margin around the cloud edges. |
| prefer_horizontal | float | 0.9 | Likelihood of horizontal text orientation; values >= 1 force horizontal layout. |
| mask | nd-array or None | None | Array defining the cloud silhouette; words are drawn only in the non-white areas (pure-white, value 255, pixels are masked out). |
| contour_width | float | 0 | Thickness of the contour line around the mask. |
| contour_color | color value | "black" | Color of the mask contour line. |
| scale | float | 1 | Scaling factor for the computed layout relative to the canvas. |
| min_font_size | int | 4 | Smallest allowed font size. |
| font_step | int | 1 | Step size for font size iteration, affecting the size gap between words. |
| max_words | number | 200 | Maximum number of tokens to render. |
| stopwords | set or None | STOPWORDS | Collection of words to exclude from the cloud (the built-in STOPWORDS list is used when None). |
| random_state | int or None | None | Seed for the random number generator to ensure reproducibility. |
| background_color | color value | "black" | Background color of the canvas. |
| max_font_size | int or None | None | Largest allowed font size (the image height is used when None). |
| mode | string | "RGB" | Color mode (e.g., "RGBA" allows transparent backgrounds). |
| relative_scaling | float | 'auto' | Impact of word frequency on font size; higher values emphasize size differences. |
| color_func | callable | None | Custom function to determine the color of each token. |
| regexp | string or None | None | Regular expression used to tokenize the input text. |
| collocations | bool | True | Whether to include bigrams (two-word phrases). |
| colormap | string or colormap | "viridis" | Matplotlib colormap for coloring the tokens. |
| normalize_plurals | bool | True | If True, merges plural words ending in 's' with their singular form. |
| repeat | bool | False | Whether to repeat words until max_words or min_font_size is reached. |
| include_numbers | bool | False | Whether to include numeric tokens. |
| min_word_length | int | 0 | Minimum character count required for a token. |
| collocation_threshold | int | 30 | Dunning likelihood score a bigram must exceed to be counted as a collocation. |
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Derive a binary word-cloud mask from an arbitrary shape image.
source_img = cv2.imread('shape_image.png')
# cv2.imread silently returns None on a missing/unreadable file, which would
# otherwise surface later as a cryptic cvtColor error — fail fast instead.
if source_img is None:
    raise FileNotFoundError("Could not read 'shape_image.png'")
grayscale = cv2.cvtColor(source_img, cv2.COLOR_BGR2GRAY)
# Inverted threshold: pixels brighter than 250 map to 0, everything else to
# 255. NOTE(review): WordCloud masks out the *white* (255) regions, so this
# polarity assumes the shape is light on a dark background — confirm against
# the actual 'shape_image.png'.
_, binary_mask = cv2.threshold(grayscale, 250, 255, cv2.THRESH_BINARY_INV)
cv2.imwrite('derived_mask.png', binary_mask)

# Show the source image and the derived mask side by side.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].imshow(cv2.cvtColor(source_img, cv2.COLOR_BGR2RGB))  # BGR -> RGB for matplotlib
ax[0].set_title('Source')
ax[0].axis('off')
ax[1].imshow(binary_mask, cmap='gray')
ax[1].set_title('Mask')
ax[1].axis('off')
plt.show()
from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image

# Shape mask produced by the OpenCV preprocessing step above.
custom_mask = np.array(Image.open("derived_mask.png"))

# Extend the built-in stopword list with a corpus-specific noise word.
filtered_stopwords = STOPWORDS | {"example"}

# Lay the words out inside the mask and draw a contour around its outline.
cloud = WordCloud(
    background_color="white",
    max_words=2000,
    mask=custom_mask,
    stopwords=filtered_stopwords,
    contour_width=3,
    contour_color='steelblue',
)
cloud.generate(raw_text)
cloud.to_file("shaped_output.png")
from wordcloud import ImageColorGenerator

# The same image supplies both the layout mask and the color palette.
coloring_img = np.array(Image.open('color_source.png'))
color_gen = ImageColorGenerator(coloring_img)

colored_cloud = WordCloud(
    background_color="white",
    mask=coloring_img,
    max_font_size=40,
    random_state=42,
)
colored_cloud.generate(raw_text)

# Left: default colors. Middle: recolored from the image. Right: the image.
# The first imshow must run before recolor(), which repaints in place.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
axes[0].imshow(colored_cloud, interpolation="bilinear")
axes[1].imshow(colored_cloud.recolor(color_func=color_gen), interpolation="bilinear")
axes[2].imshow(coloring_img, interpolation="bilinear")
for panel in axes:
    panel.axis('off')
plt.show()
import multidict as multidict
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def build_freq_dict(corpus):
    """Count term frequencies in *corpus*, skipping common English stopwords.

    Parameters
    ----------
    corpus : str
        Raw text; tokens are split on whitespace.

    Returns
    -------
    multidict.MultiDict
        Lowercased token -> count, suitable for
        WordCloud.generate_from_frequencies.
    """
    term_counts = multidict.MultiDict()
    # Exact-match stopword set. The previous regex used .match(), which is a
    # *prefix* match and therefore silently dropped any word merely starting
    # with a stopword (e.g. "apple", "theory", "inside"); it was also applied
    # before lowercasing, so capitalized stopwords ("The") slipped through.
    stopwords = {"a", "the", "an", "to", "in", "for", "of",
                 "or", "by", "with", "is", "on", "that", "be"}
    interim_dict = {}
    # str.split() with no argument already splits on any whitespace,
    # including newlines, so no pre-substitution is needed.
    for token in corpus.split():
        normalized = token.lower()
        if normalized in stopwords:
            continue
        interim_dict[normalized] = interim_dict.get(normalized, 0) + 1
    for key, val in interim_dict.items():
        term_counts.add(key, val)
    return term_counts
# Render a cloud from explicit term frequencies rather than raw text.
freq_mask = np.array(Image.open("derived_mask.png"))
frequencies = build_freq_dict(raw_text)
freq_cloud = WordCloud(
    background_color="white",
    max_words=1000,
    mask=freq_mask,
)
freq_cloud.generate_from_frequencies(frequencies)
plt.imshow(freq_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
import jieba
from imageio import imread

# Font with CJK glyph coverage — required for Chinese text, which otherwise
# renders as empty boxes.
cn_font = r'C:\Windows\Fonts\simfang.ttf'
# Context manager closes the handle deterministically (the original
# open(...).read() relied on garbage collection).
with open('chinese_doc.txt', encoding='utf-8') as doc:
    cn_text = doc.read()
cn_mask = imread('cn_mask.png')
# Domain-specific terms that jieba's default dictionary would split apart.
custom_terms = ['特定词1', '特定词2']
def tokenize_chinese(source_text, stopwords_file):
    """Segment Chinese text with jieba and drop stopwords.

    Parameters
    ----------
    source_text : str
        Raw Chinese text to segment.
    stopwords_file : str
        Path to a UTF-8 stopword list, one word per line.

    Returns
    -------
    str
        Space-separated tokens, ready for WordCloud.generate().
    """
    # Register domain terms so jieba keeps them as single tokens.
    for vocab in custom_terms:
        jieba.add_word(vocab)
    with open(stopwords_file, encoding='utf-8') as sw_file:
        # set: O(1) membership instead of an O(n) list scan per token.
        sw_set = set(sw_file.read().splitlines())
    valid_words = []
    # Iterate the tokens directly; the old "/ ".join(...).split('/')
    # round-trip would corrupt any token that itself contains a '/'.
    for w in jieba.cut(source_text, cut_all=False):
        w = w.strip()
        # len > 1 drops single characters, which carry little meaning alone.
        if w not in sw_set and len(w) > 1:
            valid_words.append(w)
    return ' '.join(valid_words)
# Segment first, then build the cloud from the space-joined tokens.
segmented_text = tokenize_chinese(cn_text, 'stopwords_cn_en.txt')
cn_cloud = WordCloud(
    font_path=cn_font,      # CJK-capable font for Chinese glyphs
    background_color="white",
    max_words=2000,
    mask=cn_mask,
    max_font_size=100,
    random_state=42,        # fixed seed -> reproducible layout
    width=1000,
    height=860,
    margin=2,
)
cn_cloud.generate(segmented_text)
cn_cloud.to_file('chinese_result.png')
import jieba
import numpy as np
from imageio import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

cn_font = r'C:\Windows\Fonts\simfang.ttf'
# Context manager closes the handle deterministically (the original
# open(...).read() leaked it until garbage collection).
with open('lyrics.txt', encoding='utf-8') as lyrics_file:
    text_data = lyrics_file.read()
color_source = imread('singer.jpg')           # photo supplying the palette
shape_mask = imread('silhouette_mask.png')    # silhouette constraining layout
def process_mandarin(text, stopword_path):
    """Segment *text* with jieba, drop stopwords and single characters,
    and return the remaining tokens as one space-separated string."""
    segmented = "/ ".join(jieba.cut(text, cut_all=False))
    with open(stopword_path, encoding='utf-8') as f:
        stops = set(f.read().splitlines())
    kept = []
    for raw in segmented.split('/'):
        token = raw.strip()
        # Keep only multi-character tokens not present in the stopword list.
        if token not in stops and len(token) > 1:
            kept.append(token)
    return ' '.join(kept)
processed_text = process_mandarin(text_data, 'stopwords_cn_en.txt')

# Lay the words out inside the silhouette mask...
final_cloud = WordCloud(
    font_path=cn_font,
    background_color="white",
    mask=shape_mask,
    max_font_size=100,
    random_state=42,
)
final_cloud.generate(processed_text)

# ...then recolor every word from the matching region of the photo.
color_mapper = ImageColorGenerator(np.array(color_source))
recolored_cloud = final_cloud.recolor(color_func=color_mapper)

# Show the photo and the recolored cloud side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes[0].imshow(color_source)
axes[0].set_title('Original Photo')
axes[1].imshow(recolored_cloud, interpolation="bilinear")
axes[1].set_title('Generated Cloud')
for panel in axes:
    panel.axis('off')
plt.show()