Visualizing a Synthetic Fruit Dataset in Python: Bananas vs Apples
This walkthrough revisits core plotting techniques in Python using a small, simulated dataset for two fruit categories. Each fruit is described by two features: cross‑section length and cross‑section width. We will render scatter plots, bar charts with error bars, histograms, box plots, multiple subplots, and a contour plot illustrating a linear classification boundary.
Environment and imports
The examples target a Jupyter notebook running Python 3.7.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Apply one shared font size to ticks, legend, and axis labels/titles
# so every figure in the walkthrough looks consistent.
for _param in ('xtick.labelsize', 'ytick.labelsize', 'legend.fontsize',
               'axes.labelsize', 'axes.titlesize'):
    plt.rcParams[_param] = 15
Generate a synthetic dataset
- Bananas: length ~ N(8.0, 0.6), width ~ N(4.0, 0.6)
- Apples: length ~ N(6.0, 0.6), width ~ N(6.0, 0.6)
- 1000 samples per class; class labels use 0 for bananas and 1 for apples
# Fixed seed: every run of the notebook draws exactly the same samples.
rng = np.random.RandomState(121)
n_per_class = 1000
sigma = 0.6

def _draw_class(mean_length, mean_width, label):
    """Sample one fruit class: Gaussian length/width plus a constant label."""
    return pd.DataFrame({
        'length': rng.normal(loc=mean_length, scale=sigma, size=n_per_class),
        'width': rng.normal(loc=mean_width, scale=sigma, size=n_per_class),
        'class': label,
    })

# Call order matches the draw order (bananas first), keeping the RNG stream stable.
bananas = _draw_class(8.0, 4.0, 0)
apples = _draw_class(6.0, 6.0, 1)
fruits = pd.concat([bananas, apples], ignore_index=True)
Visual exploration
Scatter plot
Plot length versus width for both categories.
fig, ax = plt.subplots(figsize=(6, 5))
# One scatter call per class; 'data=' maps the column names to arrays,
# and matplotlib cycles colours automatically (apples first, then bananas).
for frame, name in ((apples, 'apples'), (bananas, 'bananas')):
    ax.scatter('length', 'width', data=frame, s=20, alpha=0.6, label=name)
ax.set_xlabel('Length')
ax.set_ylabel('Width')
ax.set_title('Fruit measurements')
ax.legend()
ax.grid(alpha=0.3)
Bar chart with error bars
Compare mean length and width with 2× standard deviation as error bars.
# Long format: one row per (class, feature, value) observation,
# then mean/std per (feature, class) pair. groupby sorts its keys, so the
# row order is length(banana, apple), width(banana, apple).
tidy = fruits.melt(id_vars='class', value_vars=['length', 'width'],
                   var_name='feature', value_name='value')
summary = (tidy.groupby(['feature', 'class'])['value']
           .agg(['mean', 'std'])
           .reset_index()
           .sort_values(['feature', 'class']))
# Bar positions: a pair for "length", a gap, then a pair for "width".
xs = [0.0, 0.4, 1.2, 1.6]
heights = summary['mean'].to_numpy()
errs = 2 * summary['std'].to_numpy()  # whiskers span ±2 standard deviations
bar_colors = ['tab:blue', 'tab:blue', 'tab:orange', 'tab:orange']  # colour by feature
tick_labels = ['bananas', 'apples', 'bananas', 'apples']
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(xs, heights, yerr=errs, width=0.35, color=bar_colors,
       error_kw={'lw': 3, 'capthick': 1.5})
ax.set_xticks(xs)
ax.set_xticklabels(tick_labels)
ax.set_title('Mean length vs width (±2σ)')
ax.set_ylabel('Value')
ax.grid(axis='y', alpha=0.3)
Histogram
Overlay histograms for the length distribution by class.
# Shared bin edges (data range padded by 0.5) so the two histograms align.
lo = fruits['length'].min() - 0.5
hi = fruits['length'].max() + 0.5
bins = np.linspace(lo, hi, 30)
fig, ax = plt.subplots(figsize=(6, 4))
for frame, name in ((bananas, 'bananas'), (apples, 'apples')):
    ax.hist(frame['length'], bins=bins, alpha=0.6, density=True, label=name)
ax.set_xlabel('Length')
ax.set_ylabel('Density')
ax.set_title('Length distribution by class')
ax.legend()
ax.grid(axis='y', alpha=0.3)
Box plot
Side‑by‑side box plots for the length feature.
fig, ax = plt.subplots(figsize=(5, 4))
# patch_artist=True makes the boxes fillable patches instead of plain lines.
result = ax.boxplot([bananas['length'], apples['length']],
                    labels=['bananas', 'apples'],
                    widths=[0.35, 0.35],
                    patch_artist=True)
# Match box colours to the scatter-plot palette.
for artist, shade in zip(result['boxes'], ['tab:blue', 'tab:orange']):
    artist.set_facecolor(shade)
ax.set_ylabel('Length')
ax.set_title('Length variability')
ax.grid(axis='y', alpha=0.3)
Small multiples (subplots)
Compose a compact dashboard with common views.
fig, axes = plt.subplots(2, 2, figsize=(11, 8))
ax_scatter, ax_len = axes[0, 0], axes[0, 1]
ax_wid, ax_box = axes[1, 0], axes[1, 1]

# Top-left: raw measurements, one colour per class.
ax_scatter.scatter(apples['length'], apples['width'], s=12, alpha=0.6, label='apples')
ax_scatter.scatter(bananas['length'], bananas['width'], s=12, alpha=0.6, label='bananas')
ax_scatter.set_title('Length vs Width')
ax_scatter.set_xlabel('Length')
ax_scatter.set_ylabel('Width')
ax_scatter.legend()
ax_scatter.grid(alpha=0.3)

def _overlay_hist(ax, column, xlabel, title):
    """Overlaid per-class density histograms for one feature column."""
    edges = np.linspace(fruits[column].min() - 0.5, fruits[column].max() + 0.5, 25)
    ax.hist(bananas[column], bins=edges, alpha=0.6, density=True, label='bananas')
    ax.hist(apples[column], bins=edges, alpha=0.6, density=True, label='apples')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

# Top-right and bottom-left: one histogram panel per feature.
_overlay_hist(ax_len, 'length', 'Length', 'Length distribution')
_overlay_hist(ax_wid, 'width', 'Width', 'Width distribution')

# Bottom-right: box plot of length, coloured to match the palette.
box = ax_box.boxplot([bananas['length'], apples['length']],
                     labels=['bananas', 'apples'],
                     widths=[0.35, 0.35],
                     patch_artist=True)
for patch, col in zip(box['boxes'], ['tab:blue', 'tab:orange']):
    patch.set_facecolor(col)
ax_box.set_title('Length variability')
ax_box.set_ylabel('Length')
ax_box.grid(axis='y', alpha=0.3)
fig.tight_layout()
Contour plot: a simple linear classification boundary
Without external ML libraries, a linear discriminant can be visualized by estimating class means and a shared covariance matrix (LDA‑style). The zero level set of the discriminant function forms the decision boundary.
# Feature matrices: one row per sample, columns (length, width).
X0 = bananas[['length', 'width']].to_numpy()
X1 = apples[['length', 'width']].to_numpy()
mu0, mu1 = X0.mean(axis=0), X1.mean(axis=0)
# Pooled covariance: a simple average of the two within-class covariances
# is exact here because both classes have the same sample count.
Sigma = (np.cov(X0, rowvar=False) + np.cov(X1, rowvar=False)) / 2.0
invSigma = np.linalg.inv(Sigma)
# Linear discriminant g(x) = w·x + b with w = Σ⁻¹(μ1 − μ0);
# b is the equal-priors offset, so g(x) = 0 is the decision boundary.
w = invSigma @ (mu1 - mu0)
b = 0.5 * (mu0 @ invSigma @ mu0 - mu1 @ invSigma @ mu1)

def g(xy):
    """Evaluate the discriminant for a single point or an (n, 2) array."""
    return np.atleast_2d(xy) @ w + b

# Dense evaluation grid covering the data with a one-unit margin.
x_min, x_max = fruits['length'].min() - 1.0, fruits['length'].max() + 1.0
y_min, y_max = fruits['width'].min() - 1.0, fruits['width'].max() + 1.0
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
                     np.linspace(y_min, y_max, 300))
Z = g(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig, ax = plt.subplots(figsize=(6, 5))
# Tint each half-plane by the sign of the discriminant.
region = np.where(Z > 0, 1, 0)
ax.contourf(xx, yy, region, levels=[-0.5, 0.5, 1.5],
            colors=['#dfefff', '#ffeaea'], alpha=0.4)
# Black curve at g(x) = 0: the decision boundary itself.
ax.contour(xx, yy, Z, levels=[0], colors='k', linewidths=2)
# Overlay the raw samples on top of the shaded regions.
for frame, name in ((apples, 'apples'), (bananas, 'bananas')):
    ax.scatter(frame['length'], frame['width'], s=16, alpha=0.7, label=name)
ax.set_xlabel('Length')
ax.set_ylabel('Width')
ax.set_title('Linear decision boundary (LDA-like)')
ax.legend()
ax.grid(alpha=0.3)