# Compare how the tokenizer encodes each word with vs. without a leading space,
# then visualize the token-ID gap. Assumes `pairs` (sequence of tuples whose
# second element is a word) and `tokenizer` (HF-style, with .encode) are defined
# earlier in the file — TODO confirm with caller.
words = [p[1] for p in pairs]
# First token ID when the word is preceded by a space (" word") ...
ids_ws = [tokenizer.encode(" " + w, add_special_tokens=False)[0] for w in words]
# ... and when it is not ("word"). Only the first sub-token is compared.
ids_nws = [tokenizer.encode(w, add_special_tokens=False)[0] for w in words]
# Absolute distance between the two IDs — a proxy for "how different" the tokens are.
delta = [abs(a - b) for a, b in zip(ids_ws, ids_nws)]

x = np.arange(len(words))
width = 0.35
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.patch.set_facecolor("#FAFAF8")

# Left panel: side-by-side token IDs for the two encodings.
ax = axes[0]
ax.set_facecolor("#FAFAF8")
bars1 = ax.bar(x - width/2, ids_ws, width, label="With leading space", color="#3B6FE0", alpha=0.85)
bars2 = ax.bar(x + width/2, ids_nws, width, label="Without leading space", color="#E05C3B", alpha=0.85)
ax.set_xticks(x)
ax.set_xticklabels(words, rotation=30, ha="right", fontsize=9)
ax.set_ylabel("Token ID", fontsize=10)
ax.set_title("Token IDs: ' word' vs 'word'", fontsize=12, fontweight="bold", pad=12)
ax.legend(fontsize=9)
ax.spines[["top", "right"]].set_visible(False)
ax.grid(axis="y", alpha=0.3)
# Annotate each bar with its exact token ID (offset +50 keeps text above the bar).
for bar in bars1:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
            str(int(bar.get_height())), ha="center", va="bottom", fontsize=7, color="#3B6FE0")
for bar in bars2:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
            str(int(bar.get_height())), ha="center", va="bottom", fontsize=7, color="#E05C3B")

# Right panel: the ID distance per word, color-coded by magnitude.
ax2 = axes[1]
ax2.set_facecolor("#FAFAF8")
color_bars = ["#E05C3B" if d > 500 else "#F0A070" if d > 100 else "#A8C4F0" for d in delta]
bars3 = ax2.bar(words, delta, color=color_bars, alpha=0.9)
ax2.set_ylabel("Absolute Token ID Distance", fontsize=10)
ax2.set_title("How Far Apart Are the Token IDs?", fontsize=12, fontweight="bold", pad=12)
# set_xticks before set_xticklabels: required by modern matplotlib
# (FixedFormatter must be paired with a FixedLocator).
ax2.set_xticks(range(len(words)))
ax2.set_xticklabels(words, rotation=30, ha="right", fontsize=9)
ax2.spines[["top", "right"]].set_visible(False)
ax2.grid(axis="y", alpha=0.3)
for bar, d in zip(bars3, delta):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
             str(d), ha="center", va="bottom", fontsize=9, fontweight="bold")

# Manual legend explaining the color thresholds of the right panel.
high = mpatches.Patch(color="#E05C3B", alpha=0.9, label="> 500 apart")
med = mpatches.Patch(color="#F0A070", alpha=0.9, label="100-500 apart")
low = mpatches.Patch(color="#A8C4F0", alpha=0.9, label="< 100 apart")
ax2.legend(handles=[high, med, low], fontsize=8)

# Create the suptitle before tight_layout so the layout pass can see it;
# bbox_inches="tight" in savefig still captures the y=1.02 overhang.
plt.suptitle("Tokenization Artifacts: One Space, Completely Different Token",
             fontsize=14, fontweight="bold", y=1.02)
plt.tight_layout(pad=2)
plt.savefig("tokenization_artifact.png", dpi=150, bbox_inches="tight", facecolor="#FAFAF8")
plt.show()

