Implementing LLM Training from Scratch: 014. Use Real Data
In the previous posts we implemented tensor parallelism, mixed-precision training, checkpointing, argparse, eval, and logging. Now it is time to feed the trainer something more realistic instead of only playing with toy data.
dataset.py
from typing import List

import torch
from torch.utils.data import Dataset

try:
    from transformers import AutoTokenizer
except ImportError:
    AutoTokenizer = None


def build_tokenizer(
    model_name_or_path: str,
    use_fast: bool = True,
):
    if AutoTokenizer is None:
        raise ImportError("transformers is required: pip install transformers")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        use_fast=use_fast,
    )
    # GPT-2 style tokenizers have no pad token; reuse EOS for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


class TextDatasetForCausalLM(Dataset):
    """Concatenate all lines into one token stream and cut it into fixed seq_len chunks."""

    def __init__(
        self,
        file_paths: List[str],
        tokenizer,
        seq_len: int,
        add_eos: bool = True,
    ) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.add_eos = add_eos

        # Read all non-empty lines from the given files.
        texts: List[str] = []
        for path in file_paths:
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    texts.append(line)
        if not texts:
            raise ValueError("no non-empty lines found in the given files")

        # Tokenize everything into one flat id stream, separating lines with EOS.
        all_ids: List[int] = []
        for text in texts:
            encoded = tokenizer.encode(
                text,
                add_special_tokens=False,
            )
            all_ids.extend(encoded)
            if add_eos and getattr(tokenizer, "eos_token_id", None) is not None:
                all_ids.append(tokenizer.eos_token_id)
        if len(all_ids) < seq_len:
            raise ValueError("the total number of tokens is less than seq_len")
        self.all_ids = all_ids
        self.num_samples = len(all_ids) // seq_len

    def __len__(self) -> int:
        return self.num_samples

    def __getitem__(self, idx: int):
        # Consecutive, non-overlapping seq_len chunks of the token stream.
        start = idx * self.seq_len
        end = start + self.seq_len
        ids = self.all_ids[start:end]
        input_ids = torch.tensor(ids, dtype=torch.long)
        attention_mask = torch.ones(self.seq_len, dtype=torch.long)
        labels = input_ids.clone()
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }
The core of this PR is the standalone Dataset class. For simplicity it uses a Hugging Face tokenizer directly, and the chunking is deliberately crude: the token stream is cut into consecutive seq_len-sized chunks. Production training pipelines usually allow a seq_len window to start at an arbitrary position, which is left here as a follow-up optimization; this first data-related PR stays as simple as possible.
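As a possible follow-up, sampling the seq_len window from an arbitrary start offset could look roughly like the sketch below. This is not part of the PR; RandomWindowDataset and its num_samples argument are made-up names for illustration.

import random
from typing import List

import torch
from torch.utils.data import Dataset


class RandomWindowDataset(Dataset):
    """Sketch: draw seq_len-token windows at random offsets instead of fixed chunks."""

    def __init__(self, all_ids: List[int], seq_len: int, num_samples: int) -> None:
        if len(all_ids) <= seq_len:
            raise ValueError("need more than seq_len tokens")
        self.all_ids = all_ids
        self.seq_len = seq_len
        self.num_samples = num_samples

    def __len__(self) -> int:
        return self.num_samples

    def __getitem__(self, idx: int):
        # Any start offset is valid as long as a full window still fits.
        start = random.randint(0, len(self.all_ids) - self.seq_len)
        ids = self.all_ids[start : start + self.seq_len]
        input_ids = torch.tensor(ids, dtype=torch.long)
        return {
            "input_ids": input_ids,
            "labels": input_ids.clone(),
            "attention_mask": torch.ones(self.seq_len, dtype=torch.long),
        }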
train_minimal.py
diff --git a/rosellm/rosetrainer/train_minimal.py b/rosellm/rosetrainer/train_minimal.py
index a8883cd..ed74147 100644
--- a/rosellm/rosetrainer/train_minimal.py
+++ b/rosellm/rosetrainer/train_minimal.py
@@ -6,6 +6,7 @@ from datetime import datetime
import torch
from checkpoint import load_checkpoint, save_checkpoint
from config import GPTConfig
+from dataset import TextDatasetForCausalLM, build_tokenizer
from model import GPTModel
from torch.amp import GradScaler, autocast
from torch.utils.data import DataLoader, Dataset
@@ -40,6 +41,7 @@ def log_line(path: str, text: str | tuple[str, ...]) -> None:
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "a", encoding="utf-8") as f:
f.write(str(text) + "\n")
+ print(text)
def evaluate(
@@ -88,8 +90,20 @@ def main(args: argparse.Namespace) -> None:
log_line(log_path, f"Arguments: {args}")
checkpoint_path = args.checkpoint_path
resume = args.resume
+
+ if args.use_toy_data:
+ effective_vocab_size = args.vocab_size
+ else:
+ if not args.train_data:
+ raise ValueError("--train-data is not provided")
+ tokenizer = build_tokenizer(args.tokenizer_name)
+ tokenizer_vocab_size = getattr(tokenizer, "vocab_size", None)
+ if tokenizer_vocab_size is None:
+ tokenizer_vocab_size = len(tokenizer)
+ effective_vocab_size = tokenizer_vocab_size
+
config = GPTConfig(
- vocab_size=args.vocab_size,
+ vocab_size=effective_vocab_size,
max_position_embeddings=args.max_position_embeddings,
n_layers=args.n_layers,
n_heads=args.n_heads,
@@ -98,17 +112,40 @@ def main(args: argparse.Namespace) -> None:
dropout=args.dropout,
)
model = GPTModel(config).to(device)
- full_dataset = ToyRandomDataset(
- vocab_size=config.vocab_size,
- seq_len=args.seq_len,
- num_samples=1000,
- )
- val_size = max(int(0.2 * len(full_dataset)), 1)
- train_size = len(full_dataset) - val_size
- train_dataset, val_dataset = torch.utils.data.random_split(
- full_dataset,
- [train_size, val_size],
- )
+
+ if args.use_toy_data:
+ full_dataset = ToyRandomDataset(
+ vocab_size=config.vocab_size,
+ seq_len=args.seq_len,
+ num_samples=1000,
+ )
+ val_size = max(int(0.2 * len(full_dataset)), 1)
+ train_size = len(full_dataset) - val_size
+ train_dataset, val_dataset = torch.utils.data.random_split(
+ full_dataset,
+ [train_size, val_size],
+ )
+ else:
+ train_dataset = TextDatasetForCausalLM(
+ file_paths=args.train_data,
+ tokenizer=tokenizer,
+ seq_len=args.seq_len,
+ )
+ if args.val_data:
+ val_dataset = TextDatasetForCausalLM(
+ file_paths=args.val_data,
+ tokenizer=tokenizer,
+ seq_len=args.seq_len,
+ )
+ else:
+ val_size = max(int(0.1 * len(train_dataset)), 1)
+ train_size = len(train_dataset) - val_size
+ train_dataset, val_dataset = torch.utils.data.random_split(
+ train_dataset,
+ [train_size, val_size],
+ )
+ log_line(log_path, f"train dataset size: {len(train_dataset)}")
+ log_line(log_path, f"val dataset size: {len(val_dataset)}")
train_dataloader = DataLoader(
train_dataset,
batch_size=args.batch_size,
@@ -126,65 +163,68 @@ def main(args: argparse.Namespace) -> None:
num_steps = args.num_steps
step = 0
if resume and os.path.exists(checkpoint_path):
- print(f"Resuming from checkpoint {checkpoint_path}")
+ log_line(log_path, f"Resuming from checkpoint {checkpoint_path}")
step, extra = load_checkpoint(checkpoint_path, model, optimizer, scaler)
- print(f"Resumed from step {step}")
+ log_line(log_path, f"Resumed from step {step}")
elif resume:
- print("Resume flag is set, but checkpoint not found. Starting from scratch.")
+ log_line(
+ log_path,
+ "Resume flag is set, but checkpoint not found. Starting from scratch.",
+ )
else:
- print("Starting from scratch")
- for batch in train_dataloader:
- step += 1
- if step > num_steps:
- break
- input_ids = batch["input_ids"].to(device)
- labels = batch["labels"].to(device)
- attention_mask = batch["attention_mask"].to(device)
- optimizer.zero_grad()
- if use_amp:
- with autocast(device_type=device.type):
+ log_line(log_path, "Starting from scratch")
+ while step < num_steps:
+ for batch in train_dataloader:
+ step += 1
+ if step > num_steps:
+ break
+ input_ids = batch["input_ids"].to(device)
+ labels = batch["labels"].to(device)
+ attention_mask = batch["attention_mask"].to(device)
+ optimizer.zero_grad()
+ if use_amp:
+ with autocast(device_type=device.type):
+ logits, loss = model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ labels=labels,
+ )
+ scaler.scale(loss).backward()
+ scaler.step(optimizer)
+ scaler.update()
+ else:
logits, loss = model(
input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
)
- scaler.scale(loss).backward()
- scaler.step(optimizer)
- scaler.update()
- else:
- logits, loss = model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- labels=labels,
- )
- loss.backward()
- optimizer.step()
- if step % 20 == 0:
- save_checkpoint(
- checkpoint_path,
- model=model,
- optimizer=optimizer,
- step=step,
- scaler=scaler if use_amp else None,
- extra={"note": "single_gpt_minimal"},
- )
- if step % 10 == 0:
- val_loss = evaluate(
- model,
- val_dataloader,
- device=device,
- use_amp=use_amp,
- )
- val_ppl = math.exp(val_loss)
- msg = (
- f"step {step} / {num_steps} ",
- f"train loss: {loss.item():.4f} ",
- f"val loss: {val_loss:.4f} ",
- f"val ppl: {val_ppl:.4f} ",
- f"amp: {use_amp}",
- )
- print(msg)
- log_line(log_path, msg)
+ loss.backward()
+ optimizer.step()
+ if step % 20 == 0:
+ save_checkpoint(
+ checkpoint_path,
+ model=model,
+ optimizer=optimizer,
+ step=step,
+ scaler=scaler if use_amp else None,
+ extra={"note": "single_gpt_minimal"},
+ )
+ if step % 10 == 0:
+ val_loss = evaluate(
+ model,
+ val_dataloader,
+ device=device,
+ use_amp=use_amp,
+ )
+ val_ppl = math.exp(val_loss)
+ msg = (
+ f"step {step} / {num_steps} ",
+ f"train loss: {loss.item():.4f} ",
+ f"val loss: {val_loss:.4f} ",
+ f"val ppl: {val_ppl:.4f} ",
+ f"amp: {use_amp}",
+ )
+ log_line(log_path, msg)
def parse_args() -> argparse.Namespace:
@@ -198,7 +238,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-position-embeddings",
type=int,
- default=128,
+ default=10000,
help="Max sequence length.",
)
parser.add_argument(
@@ -276,6 +316,32 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Resume training from checkpoint.",
)
+ # data and tokenizer
+ parser.add_argument(
+ "--train-data",
+ type=str,
+ nargs="*",
+ default=[],
+ help="Path to training data",
+ )
+ parser.add_argument(
+ "--val-data",
+ type=str,
+ nargs="*",
+ default=[],
+ help="Path to val data (optional, can be auto-split from train)",
+ )
+ parser.add_argument(
+ "--tokenizer-name",
+ type=str,
+ default="gpt2",
+ help="tokenizer name",
+ )
+ parser.add_argument(
+ "--use-toy-data",
+ action="store_true",
+ help="use random toy data rather than real data",
+ )
return parser.parse_args()
The single-GPU trainer mostly just swaps in the new Dataset class and adds a few command-line options; with that in place we can train on real data.
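One detail worth spelling out: with real text, the model's vocab_size has to come from the tokenizer rather than from --vocab-size, otherwise ids produced by the tokenizer could fall outside the embedding table. Here is the same resolution logic from the diff, pulled out as a standalone sketch (the resolve_vocab_size helper name is made up; assumes transformers is installed):

from transformers import AutoTokenizer


def resolve_vocab_size(tokenizer_name: str = "gpt2") -> int:
    """Sketch: derive the embedding size from the tokenizer, mirroring the diff above."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    vocab_size = getattr(tokenizer, "vocab_size", None)
    if vocab_size is None:
        # Some tokenizers only expose their size via len(); this also counts added tokens.
        vocab_size = len(tokenizer)
    return vocab_size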
train_ddp.py
diff --git a/rosellm/rosetrainer/train_ddp.py b/rosellm/rosetrainer/train_ddp.py
index eb7dfa6..8f3f07d 100644
--- a/rosellm/rosetrainer/train_ddp.py
+++ b/rosellm/rosetrainer/train_ddp.py
@@ -7,6 +7,7 @@ import torch
import torch.distributed as dist
from checkpoint import load_checkpoint, save_checkpoint
from config import GPTConfig
+from dataset import TextDatasetForCausalLM, build_tokenizer
from model import GPTModel
from torch.amp import GradScaler, autocast
from torch.nn.parallel import DistributedDataParallel as DDP
@@ -42,6 +43,7 @@ def log_line(path: str, text: str | tuple[str, ...]) -> None:
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "a", encoding="utf-8") as f:
f.write(str(text) + "\n")
+ print(text)
def evaluate_ddp(
@@ -116,8 +118,20 @@ def main(args: argparse.Namespace) -> None:
log_line(log_path, f"[rank {local_rank}] Using device: {device}")
log_line(log_path, f"Arguments: {args}")
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
+
+ if args.use_toy_data:
+ effective_vocab_size = args.vocab_size
+ else:
+ if not args.train_data:
+ raise ValueError("--train-data is not provided")
+ tokenizer = build_tokenizer(args.tokenizer_name)
+ tokenizer_vocab_size = getattr(tokenizer, "vocab_size", None)
+ if tokenizer_vocab_size is None:
+ tokenizer_vocab_size = len(tokenizer)
+ effective_vocab_size = tokenizer_vocab_size
+
config = GPTConfig(
- vocab_size=args.vocab_size,
+ vocab_size=effective_vocab_size,
max_position_embeddings=args.max_position_embeddings,
n_layers=args.n_layers,
n_heads=args.n_heads,
@@ -132,17 +146,37 @@ def main(args: argparse.Namespace) -> None:
output_device=device.index,
find_unused_parameters=False,
)
- full_dataset = ToyRandomDataset(
- vocab_size=config.vocab_size,
- seq_len=args.seq_len,
- num_samples=1000,
- )
- val_size = max(int(0.2 * len(full_dataset)), 1)
- train_size = len(full_dataset) - val_size
- train_dataset, val_dataset = torch.utils.data.random_split(
- full_dataset,
- [train_size, val_size],
- )
+ if args.use_toy_data:
+ full_dataset = ToyRandomDataset(
+ vocab_size=config.vocab_size,
+ seq_len=args.seq_len,
+ num_samples=1000,
+ )
+ val_size = max(int(0.2 * len(full_dataset)), 1)
+ train_size = len(full_dataset) - val_size
+ train_dataset, val_dataset = torch.utils.data.random_split(
+ full_dataset,
+ [train_size, val_size],
+ )
+ else:
+ train_dataset = TextDatasetForCausalLM(
+ file_paths=args.train_data,
+ tokenizer=tokenizer,
+ seq_len=args.seq_len,
+ )
+ if args.val_data:
+ val_dataset = TextDatasetForCausalLM(
+ file_paths=args.val_data,
+ tokenizer=tokenizer,
+ seq_len=args.seq_len,
+ )
+ else:
+ val_size = max(int(0.1 * len(train_dataset)), 1)
+ train_size = len(train_dataset) - val_size
+ train_dataset, val_dataset = torch.utils.data.random_split(
+ train_dataset,
+ [train_size, val_size],
+ )
train_sampler = DistributedSampler(
train_dataset,
num_replicas=dist.get_world_size(),
@@ -173,7 +207,9 @@ def main(args: argparse.Namespace) -> None:
num_steps = args.num_steps
step = 0
if resume and os.path.exists(checkpoint_path):
- print(f"[rank {local_rank}] Resuming from checkpoint {checkpoint_path}")
+ log_line(
+ log_path, f"[rank {local_rank}] Resuming from checkpoint {checkpoint_path}"
+ )
step, extra = load_checkpoint(
checkpoint_path,
ddp_model.module,
@@ -181,13 +217,14 @@ def main(args: argparse.Namespace) -> None:
scaler,
map_location=device.type,
)
- print(f"[rank {local_rank}] Resumed from step {step}")
+ log_line(log_path, f"[rank {local_rank}] Resumed from step {step}")
elif resume and is_main_process(local_rank):
- print(
- f"[rank {local_rank}] Resume flag is set, but checkpoint not found. Starting from scratch."
+ log_line(
+ log_path,
+ f"[rank {local_rank}] Resume flag is set, but checkpoint not found. Starting from scratch.",
)
elif is_main_process(local_rank):
- print(f"[rank {local_rank}] Starting from scratch")
+ log_line(log_path, f"[rank {local_rank}] Starting from scratch")
for epoch in range(1, 1000):
train_sampler.set_epoch(epoch)
for batch in train_dataloader:
@@ -241,12 +278,11 @@ def main(args: argparse.Namespace) -> None:
f"val ppl: {val_ppl:.4f} ",
f"amp: {use_amp}",
)
- print(msg)
log_line(log_path, msg)
if step > num_steps:
break
if is_main_process(local_rank):
- print("Training finished.")
+ log_line(log_path, "Training finished.")
cleanup_distributed()
@@ -261,7 +297,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-position-embeddings",
type=int,
- default=128,
+ default=10000,
help="Max sequence length.",
)
parser.add_argument(
@@ -339,6 +375,32 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Resume training from checkpoint.",
)
+ # data and tokenizer
+ parser.add_argument(
+ "--train-data",
+ type=str,
+ nargs="*",
+ default=[],
+ help="Path to training data",
+ )
+ parser.add_argument(
+ "--val-data",
+ type=str,
+ nargs="*",
+ default=[],
+ help="Path to val data",
+ )
+ parser.add_argument(
+ "--tokenizer-name",
+ type=str,
+ default="gpt2",
+ help="tokenizer name",
+ )
+ parser.add_argument(
+ "--use-toy-data",
+ action="store_true",
+ help="use toy data",
+ )
return parser.parse_args()
The changes to the DDP trainer are essentially the same.
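The only data-specific point worth repeating on the DDP side is that DistributedSampler gives each rank a disjoint 1/world_size shard of the dataset, and set_epoch must be called at the start of every epoch so the shuffle changes. A minimal sketch of that wiring, assuming the process group is already initialized (build_train_loader is a hypothetical helper, not code from this repo):

import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler


def build_train_loader(train_dataset, batch_size: int):
    """Sketch: shard the dataset across ranks; assumes init_process_group was called."""
    sampler = DistributedSampler(
        train_dataset,
        num_replicas=dist.get_world_size(),
        rank=dist.get_rank(),
        shuffle=True,
    )
    loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    return loader, sampler


# Once per epoch, reseed the shuffle so each rank sees a different split of windows:
# sampler.set_epoch(epoch)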
Running
Now we can train on some real text. For example, take the plain-text version of https://www.gutenberg.org/ebooks/11 (Alice's Adventures in Wonderland), put most of it in data/train.txt and the rest in data/val.txt, then start training and watch the loss go down. A model trained this way can actually be used for inference to get a feel for its output; we can add a simple inference path in the next PR for that kind of qualitative check.
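For reference, a tiny script along these lines can do the split; the input path alice.txt and the 90/10 ratio are just example choices:

# split_data.py -- split a raw text file into train/val by line count.
import os


def split_file(src: str, train_path: str, val_path: str, train_ratio: float = 0.9) -> None:
    with open(src, "r", encoding="utf-8") as f:
        lines = [line for line in f if line.strip()]
    cut = int(len(lines) * train_ratio)
    os.makedirs(os.path.dirname(train_path) or ".", exist_ok=True)
    with open(train_path, "w", encoding="utf-8") as f:
        f.writelines(lines[:cut])
    with open(val_path, "w", encoding="utf-8") as f:
        f.writelines(lines[cut:])


if __name__ == "__main__":
    split_file("alice.txt", "data/train.txt", "data/val.txt")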
Single-GPU training:
$ python train_minimal.py --train-data data/train.txt --val-data data/val.txt --tokenizer-name gpt2 --seq-len 1024 --batch-size 8 --num-steps 200
Training started at 2025-11-28 17:06:33
Using device: cuda
Arguments: Namespace(vocab_size=10000, max_position_embeddings=10000, n_layers=2, n_heads=4, d_model=128, d_ff=512, dropout=0.1, use_tensor_parallel=False, batch_size=8, seq_len=1024, num_steps=200, lr=0.0003, no_amp=False, checkpoint_path='checkpoints/minigpt_single.pt', resume=False, train_data=['data/train.txt'], val_data=['data/val.txt'], tokenizer_name='gpt2', use_toy_data=False)
train dataset size: 32
val dataset size: 46
Starting from scratch
('step 10 / 200 ', 'train loss: 10.6139 ', 'val loss: 10.5547 ', 'val ppl: 38357.6676 ', 'amp: True')
('step 20 / 200 ', 'train loss: 9.9051 ', 'val loss: 9.8221 ', 'val ppl: 18436.6312 ', 'amp: True')
('step 30 / 200 ', 'train loss: 8.9364 ', 'val loss: 8.9257 ', 'val ppl: 7523.1618 ', 'amp: True')
('step 40 / 200 ', 'train loss: 8.2415 ', 'val loss: 8.1917 ', 'val ppl: 3610.9226 ', 'amp: True')
('step 50 / 200 ', 'train loss: 7.6098 ', 'val loss: 7.5641 ', 'val ppl: 1927.7076 ', 'amp: True')
('step 60 / 200 ', 'train loss: 6.9796 ', 'val loss: 7.0511 ', 'val ppl: 1154.1602 ', 'amp: True')
('step 70 / 200 ', 'train loss: 6.5684 ', 'val loss: 6.6589 ', 'val ppl: 779.6782 ', 'amp: True')
('step 80 / 200 ', 'train loss: 6.3557 ', 'val loss: 6.3737 ', 'val ppl: 586.2495 ', 'amp: True')
('step 90 / 200 ', 'train loss: 6.0814 ', 'val loss: 6.1744 ', 'val ppl: 480.2882 ', 'amp: True')
('step 100 / 200 ', 'train loss: 5.8957 ', 'val loss: 6.0485 ', 'val ppl: 423.4948 ', 'amp: True')
('step 110 / 200 ', 'train loss: 5.7787 ', 'val loss: 5.9599 ', 'val ppl: 387.5877 ', 'amp: True')
('step 120 / 200 ', 'train loss: 5.6908 ', 'val loss: 5.8859 ', 'val ppl: 359.9240 ', 'amp: True')
('step 130 / 200 ', 'train loss: 5.5336 ', 'val loss: 5.8183 ', 'val ppl: 336.4132 ', 'amp: True')
('step 140 / 200 ', 'train loss: 5.4770 ', 'val loss: 5.7541 ', 'val ppl: 315.4703 ', 'amp: True')
('step 150 / 200 ', 'train loss: 5.3506 ', 'val loss: 5.6913 ', 'val ppl: 296.2860 ', 'amp: True')
('step 160 / 200 ', 'train loss: 5.3421 ', 'val loss: 5.6343 ', 'val ppl: 279.8512 ', 'amp: True')
('step 170 / 200 ', 'train loss: 5.2963 ', 'val loss: 5.5781 ', 'val ppl: 264.5793 ', 'amp: True')
('step 180 / 200 ', 'train loss: 5.1676 ', 'val loss: 5.5264 ', 'val ppl: 251.2452 ', 'amp: True')
('step 190 / 200 ', 'train loss: 5.1330 ', 'val loss: 5.4757 ', 'val ppl: 238.8231 ', 'amp: True')
('step 200 / 200 ', 'train loss: 5.3526 ', 'val loss: 5.4268 ', 'val ppl: 227.4290 ', 'amp: True')
Distributed data parallel (DDP) training:
$ torchrun --nproc-per-node=2 train_ddp.py --train-data data/train.txt --val-data data/val.txt --tokenizer-name gpt2 --seq-len 1024 --batch-size 8 --num-steps 200
W1128 17:13:01.472000 765093 site-packages/torch/distributed/run.py:792]
W1128 17:13:01.472000 765093 site-packages/torch/distributed/run.py:792] *****************************************
W1128 17:13:01.472000 765093 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1128 17:13:01.472000 765093 site-packages/torch/distributed/run.py:792] *****************************************
Training started at 2025-11-28 17:13:02
[rank 0] Using device: cuda:0
Arguments: Namespace(vocab_size=10000, max_position_embeddings=10000, n_layers=2, n_heads=4, d_model=128, d_ff=512, dropout=0.1, use_tensor_parallel=False, batch_size=8, seq_len=1024, num_steps=200, lr=0.0003, no_amp=False, checkpoint_path='checkpoints/minigpt_ddp.pt', resume=False, train_data=['data/train.txt'], val_data=['data/val.txt'], tokenizer_name='gpt2', use_toy_data=False)
[rank 0] Starting from scratch
('step 10 / 200 ', 'train loss: 10.6083 ', 'val loss: 10.5442 ', 'val ppl: 37956.3182 ', 'amp: True')
('step 20 / 200 ', 'train loss: 9.8831 ', 'val loss: 9.7849 ', 'val ppl: 17764.3321 ', 'amp: True')
('step 30 / 200 ', 'train loss: 8.9969 ', 'val loss: 8.8916 ', 'val ppl: 7270.7905 ', 'amp: True')
('step 40 / 200 ', 'train loss: 8.2000 ', 'val loss: 8.1592 ', 'val ppl: 3495.2977 ', 'amp: True')
('step 50 / 200 ', 'train loss: 7.5706 ', 'val loss: 7.5283 ', 'val ppl: 1859.9687 ', 'amp: True')
('step 60 / 200 ', 'train loss: 6.9359 ', 'val loss: 7.0018 ', 'val ppl: 1098.5724 ', 'amp: True')
('step 70 / 200 ', 'train loss: 6.5049 ', 'val loss: 6.6018 ', 'val ppl: 736.3947 ', 'amp: True')
('step 80 / 200 ', 'train loss: 6.1959 ', 'val loss: 6.3190 ', 'val ppl: 555.0407 ', 'amp: True')
('step 90 / 200 ', 'train loss: 5.9893 ', 'val loss: 6.1319 ', 'val ppl: 460.3293 ', 'amp: True')
('step 100 / 200 ', 'train loss: 5.8101 ', 'val loss: 6.0096 ', 'val ppl: 407.3259 ', 'amp: True')
('step 110 / 200 ', 'train loss: 5.7503 ', 'val loss: 5.9232 ', 'val ppl: 373.6082 ', 'amp: True')
('step 120 / 200 ', 'train loss: 5.6801 ', 'val loss: 5.8452 ', 'val ppl: 345.5622 ', 'amp: True')
('step 130 / 200 ', 'train loss: 5.5241 ', 'val loss: 5.7663 ', 'val ppl: 319.3581 ', 'amp: True')
('step 140 / 200 ', 'train loss: 5.3991 ', 'val loss: 5.6918 ', 'val ppl: 296.4208 ', 'amp: True')
('step 150 / 200 ', 'train loss: 5.2615 ', 'val loss: 5.6235 ', 'val ppl: 276.8460 ', 'amp: True')
('step 160 / 200 ', 'train loss: 5.2492 ', 'val loss: 5.5583 ', 'val ppl: 259.3840 ', 'amp: True')
('step 170 / 200 ', 'train loss: 5.2032 ', 'val loss: 5.4943 ', 'val ppl: 243.2923 ', 'amp: True')
('step 180 / 200 ', 'train loss: 5.1544 ', 'val loss: 5.4326 ', 'val ppl: 228.7322 ', 'amp: True')
('step 190 / 200 ', 'train loss: 5.1739 ', 'val loss: 5.3719 ', 'val ppl: 215.2690 ', 'amp: True')
('step 200 / 200 ', 'train loss: 5.0410 ', 'val loss: 5.3140 ', 'val ppl: 203.1597 ', 'amp: True')
Training finished.