Terminology-Constrained Neural Machine Translation: From GRU Seq2Seq to Transformer Architectures
Machine translation systems have evolved from rule-based approaches through statistical methods to modern neural architectures. Current research emphasizes context-aware translation, domain adaptation, and terminology-constrained generation to ensure specialized vocabulary accuracy in professional domains.
This technical implementation addresses English-to-Chinese translation with dictionary-based terminology intervention. The dataset comprises 140,000 parallel sentence pairs for training, 1,000 for validation, and 1,000 for testing, accompanied by a bilingual terminology dictionary containing 2,226 domain-specific entries. The training set is used to optimize model parameters, the validation set guides hyperparameter tuning, and the test set provides an unbiased final evaluation.
GRU-Based Sequence-to-Sequence Baseline
The foundational architecture employs Gated Recurrent Units (GRUs) in an encoder-decoder framework, integrating terminology constraints through vocabulary engineering.
Data Pipeline with Terminology Injection
The corpus loader prioritizes dictionary terms during vocabulary construction to ensure specialized terminology receives appropriate token indices:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchtext.data.utils import get_tokenizer
from collections import Counter
import random
import time
class BilingualCorpus(Dataset):
def __init__(self, filepath, term_dict):
self.samples = []
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
source, target = line.strip().split('\t')
self.samples.append((source, target))
self.term_dict = term_dict
self.source_tokenizer = get_tokenizer('basic_english')
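# Chinese targets are tokenized at the character level: list() splits a string into characters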
self.target_tokenizer = list
source_counter = Counter(self.term_dict.keys())
target_counter = Counter()
for src, tgt in self.samples:
source_counter.update(self.source_tokenizer(src))
target_counter.update(self.target_tokenizer(tgt))
self.source_vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + list(term_dict.keys()) + [w for w, _ in source_counter.most_common(10000) if w not in term_dict]
self.target_vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + [w for w, _ in target_counter.most_common(10000)]
self.src_stoi = {w: i for i, w in enumerate(self.source_vocab)}
self.tgt_stoi = {w: i for i, w in enumerate(self.target_vocab)}
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
src, tgt = self.samples[idx]
# OOV tokens map to '<unk>'; targets are wrapped in '<sos>' ... '<eos>' so training matches decoding, which starts from '<sos>'
src_ids = [self.src_stoi.get(t, self.src_stoi['<unk>']) for t in self.source_tokenizer(src)] + [self.src_stoi['<eos>']]
tgt_ids = [self.tgt_stoi['<sos>']] + [self.tgt_stoi.get(t, self.tgt_stoi['<unk>']) for t in self.target_tokenizer(tgt)] + [self.tgt_stoi['<eos>']]
return torch.tensor(src_ids), torch.tensor(tgt_ids)
def pad_sequences(batch):
sources, targets = zip(*batch)
sources = nn.utils.rnn.pad_sequence(sources, padding_value=0, batch_first=True)
targets = nn.utils.rnn.pad_sequence(targets, padding_value=0, batch_first=True)
return sources, targets
Encoder-Decoder Architecture
class SourceEncoder(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
embedded = self.dropout(self.embedding(x))
outputs, hidden = self.gru(embedded)
return outputs, hidden
class TargetDecoder(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
super().__init__()
self.vocab_size = vocab_size
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
self.classifier = nn.Linear(hidden_dim, vocab_size)
self.dropout = nn.Dropout(dropout)
def forward(self, input_step, hidden):
embedded = self.dropout(self.embedding(input_step))
output, hidden = self.gru(embedded, hidden)
prediction = self.classifier(output.squeeze(1))
return prediction, hidden
class Seq2SeqTranslator(nn.Module):
def __init__(self, encoder, decoder, device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.device = device
def forward(self, source, target, teacher_forcing_ratio=0.5):
batch_size = source.shape[0]
target_len = target.shape[1]
vocab_size = self.decoder.vocab_size
outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
_, hidden = self.encoder(source)
input_step = target[:, 0].unsqueeze(1)
for t in range(1, target_len):
output, hidden = self.decoder(input_step, hidden)
outputs[:, t, :] = output
teacher_force = random.random() < teacher_forcing_ratio
top_class = output.argmax(1)
input_step = target[:, t].unsqueeze(1) if teacher_force else top_class.unsqueeze(1)
return outputs
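A quick shape check confirms the wiring before committing to full training. The following is a minimal sketch with arbitrary toy dimensions (the vocabulary sizes, batch size, and sequence lengths below are placeholders, not values from the dataset):
enc = SourceEncoder(vocab_size=100, embed_dim=32, hidden_dim=64, num_layers=2, dropout=0.1)
dec = TargetDecoder(vocab_size=120, embed_dim=32, hidden_dim=64, num_layers=2, dropout=0.1)
toy_model = Seq2SeqTranslator(enc, dec, torch.device('cpu'))
src = torch.randint(0, 100, (4, 12))   # (batch, source length)
tgt = torch.randint(0, 120, (4, 15))   # (batch, target length)
out = toy_model(src, tgt)
print(out.shape)  # torch.Size([4, 15, 120]) -- one logit vector per target position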
Training and Optimization
def load_terminology(path):
terms = {}
with open(path, 'r', encoding='utf-8') as f:
for line in f:
src_term, tgt_term = line.strip().split('\t')
terms[src_term] = tgt_term
return terms
def training_step(model, dataloader, optimizer, criterion, clip_norm):
model.train()
epoch_loss = 0
for src_batch, tgt_batch in dataloader:
src_batch = src_batch.to(device)
tgt_batch = tgt_batch.to(device)
optimizer.zero_grad()
predictions = model(src_batch, tgt_batch)
vocab_dim = predictions.shape[-1]
loss = criterion(predictions[:, 1:].reshape(-1, vocab_dim), tgt_batch[:, 1:].reshape(-1))
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
optimizer.step()
epoch_loss += loss.item()
return epoch_loss / len(dataloader)
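The validation split mentioned earlier is not exercised in the script below; a matching evaluation pass can mirror training_step with gradients disabled and teacher forcing turned off. This is a sketch under those assumptions (the name evaluation_step is not part of the original code):
def evaluation_step(model, dataloader, criterion):
    # Compute average loss on held-out data without updating parameters
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)
            predictions = model(src_batch, tgt_batch, teacher_forcing_ratio=0)
            vocab_dim = predictions.shape[-1]
            loss = criterion(predictions[:, 1:].reshape(-1, vocab_dim), tgt_batch[:, 1:].reshape(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)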
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
terminology = load_terminology('dictionary.txt')
corpus = BilingualCorpus('train.txt', terminology)
# Subset sampling for rapid prototyping
training_subset = Subset(corpus, range(1000))
training_loader = DataLoader(training_subset, batch_size=32, shuffle=True, collate_fn=pad_sequences)
encoder = SourceEncoder(len(corpus.source_vocab), 256, 512, 2, 0.5)
decoder = TargetDecoder(len(corpus.target_vocab), 256, 512, 2, 0.5)
model = Seq2SeqTranslator(encoder, decoder, device).to(device)
optimizer = optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss(ignore_index=corpus.tgt_stoi['<pad>'])
for epoch in range(10):
avg_loss = training_step(model, training_loader, optimizer, loss_function, 1.0)
print(f'Epoch: {epoch+1:02} | Loss: {avg_loss:.3f}')
torch.save(model.state_dict(), 'gru_translation_model.pth')
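To reuse the checkpoint later, the saved state dict can be restored into an identically configured model before switching to evaluation mode:
model.load_state_dict(torch.load('gru_translation_model.pth', map_location=device))
model.eval()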
Inference and BLEU Evaluation
The BLEU (Bilingual Evaluation Understudy) metric measures n-gram overlap between generated translations and reference texts. BLEU-4 combines modified precisions for 1-grams through 4-grams via a geometric mean and applies a brevity penalty, yielding a score that correlates reasonably well with human judgments of translation quality.
from sacrebleu.metrics import BLEU
def greedy_decode(model, sentence, dataset, terminology, device, max_length=50):
model.eval()
tokens = dataset.source_tokenizer(sentence)
indices = [dataset.src_stoi.get(t, dataset.src_stoi['<unk>']) for t in tokens] + [dataset.src_stoi['<eos>']]
tensor = torch.LongTensor(indices).unsqueeze(0).to(device)
with torch.no_grad():
_, hidden = model.encoder(tensor)
decoded = []
current_token = torch.LongTensor([[dataset.tgt_stoi['<sos>']]]).to(device)
for _ in range(max_length):
output, hidden = model.decoder(current_token, hidden)
pred_id = output.argmax(1)
word = dataset.target_vocab[pred_id.item()]
if word == '<eos>':
break
# Terminology replacement: map an emitted source-language term to its dictionary translation
if word in terminology:
word = terminology[word]
decoded.append(word)
current_token = pred_id.unsqueeze(1)
return ''.join(decoded)
def calculate_bleu(model, dataset, src_file, ref_file, terminology, device):
with open(src_file, 'r', encoding='utf-8') as f:
sources = [line.strip() for line in f]
with open(ref_file, 'r', encoding='utf-8') as f:
references = [line.strip() for line in f]
hypotheses = [greedy_decode(model, s, dataset, terminology, device) for s in sources]
metric = BLEU(tokenize='zh')  # Chinese-aware tokenization for unsegmented references
score = metric.corpus_score(hypotheses, [references])
return score
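Putting the pieces together, evaluation on a held-out set might look like the snippet below; the file names dev_source.txt and dev_reference.txt are placeholders for whatever the validation or test files are actually named:
bleu = calculate_bleu(model, corpus, 'dev_source.txt', 'dev_reference.txt', terminology, device)
print(f'BLEU: {bleu.score:.2f}')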
Transformer Implementation
The Transformer architecture eliminates recurrent connections in favor of multi-head self-attention, enabling parallel computation and improved long-range dependency modeling.
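At the heart of each attention head is scaled dot-product attention, softmax(QK^T / sqrt(d_k)) V. The model below delegates this to nn.Transformer, but an illustrative standalone implementation looks roughly like this:
def scaled_dot_product_attention(query, key, value, mask=None):
    # query, key, value: (..., seq_len, d_k); positions where mask is True are blocked
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / (d_k ** 0.5)
    if mask is not None:
        scores = scores.masked_fill(mask, float('-inf'))
    weights = torch.softmax(scores, dim=-1)
    return torch.matmul(weights, value), weights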
Positional Encoding and Model Definition
import math
class SinusoidalPositionEncoder(nn.Module):
def __init__(self, dim_model, max_seq_len=5000, dropout_rate=0.1):
super().__init__()
self.dropout = nn.Dropout(dropout_rate)
position_matrix = torch.zeros(max_seq_len, dim_model)
positions = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0) / dim_model))
position_matrix[:, 0::2] = torch.sin(positions * div_term)
position_matrix[:, 1::2] = torch.cos(positions * div_term)
position_matrix = position_matrix.unsqueeze(0).transpose(0, 1)
self.register_buffer('position_matrix', position_matrix)
def forward(self, embeddings):
seq_len = embeddings.size(0)
embeddings = embeddings + self.position_matrix[:seq_len, :]
return self.dropout(embeddings)
class TransformerTranslator(nn.Module):
def __init__(self, src_vocab, tgt_vocab, hidden_dim, attention_heads, enc_layers, dec_layers, ff_dim, dropout):
super().__init__()
self.core_transformer = nn.Transformer(
d_model=hidden_dim,
nhead=attention_heads,
num_encoder_layers=enc_layers,
num_decoder_layers=dec_layers,
dim_feedforward=ff_dim,
dropout=dropout,
batch_first=False
)
self.src_embedding = nn.Embedding(len(src_vocab), hidden_dim)
self.tgt_embedding = nn.Embedding(len(tgt_vocab), hidden_dim)
self.pos_encoder = SinusoidalPositionEncoder(hidden_dim, dropout_rate=dropout)
self.output_projection = nn.Linear(hidden_dim, len(tgt_vocab))
self.src_vocab_map = src_vocab
self.tgt_vocab_map = tgt_vocab
self.scale_factor = math.sqrt(hidden_dim)
def forward(self, source_seq, target_seq):
# Transpose to (seq_len, batch_size) for transformer
source_seq = source_seq.transpose(0, 1)
target_seq = target_seq.transpose(0, 1)
# Causal mask for the decoder only; the encoder attends to all source positions
tgt_mask = self.core_transformer.generate_square_subsequent_mask(target_seq.size(0)).to(target_seq.device)
src_padding_mask = (source_seq == self.src_vocab_map['<pad>']).transpose(0, 1)
tgt_padding_mask = (target_seq == self.tgt_vocab_map['<pad>']).transpose(0, 1)
# Embeddings with scaling and positional encoding
src_emb = self.pos_encoder(self.src_embedding(source_seq) * self.scale_factor)
tgt_emb = self.pos_encoder(self.tgt_embedding(target_seq) * self.scale_factor)
transformer_out = self.core_transformer(
src_emb, tgt_emb, tgt_mask=tgt_mask,
src_key_padding_mask=src_padding_mask,
tgt_key_padding_mask=tgt_padding_mask,
memory_key_padding_mask=src_padding_mask
)
return self.output_projection(transformer_out).transpose(0, 1)
The Transformer implementation utilizes stacked self-attention mechanisms and feed-forward networks, replacing sequential processing with parallelizable matrix operations. Positional encodings inject sequence order information through sinusoidal functions, while multi-head attention captures diverse contextual relationships across the input sequence.
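Training the Transformer can reuse the corpus, data loader, optimizer setup, and loss function from the GRU pipeline, with the standard shifted-target scheme: the decoder input is the target sequence without its last token, and the labels are the target sequence without '<sos>'. The hyperparameters below are illustrative assumptions rather than tuned values:
tf_model = TransformerTranslator(corpus.src_stoi, corpus.tgt_stoi, hidden_dim=256,
                                 attention_heads=8, enc_layers=3, dec_layers=3,
                                 ff_dim=512, dropout=0.1).to(device)
tf_optimizer = optim.Adam(tf_model.parameters(), lr=1e-4)
tf_model.train()
for src_batch, tgt_batch in training_loader:
    src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
    tf_optimizer.zero_grad()
    logits = tf_model(src_batch, tgt_batch[:, :-1])   # (batch, tgt_len - 1, vocab)
    loss = loss_function(logits.reshape(-1, logits.shape[-1]), tgt_batch[:, 1:].reshape(-1))
    loss.backward()
    tf_optimizer.step()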