84 lines
2.0 KiB
Python
Raw Normal View History

from extra.hcqfuzz.spec import TestSpec
import random
bert_train_params = {
"DEFAULT_FLOAT": "HALF",
"SUM_DTYPE": "HALF",
"GPUS": 6,
"BS": 96,
"EVAL_BS": 96,
"FUSE_ARANGE": 1,
"FUSE_ARANGE_UINT": 0,
"BASEDIR": "/raid/datasets/wiki",
}
class TrainBert(TestSpec):
def prepare(self, dev, seed):
random.seed(seed)
self.env = {
**bert_train_params,
"IGNORE_BEAM_CACHE": 1,
"BEAM": 5,
"BEAM_UOPS_MAX": 10000,
"BEAM_UPCAST_MAX": 256,
"BEAM_LOCAL_MAX": 1024,
"BEAM_MIN_PROGRESS": 5,
"IGNORE_JIT_FIRST_BEAM": 1,
"LOGMLPERF": 0,
"SEED": seed,
}
self.cmd = "python3 examples/mlperf/model_train.py"
self.timeout = 7 * 60 * 60 # 7 hours
def get_exec_state(self): return self.env, self.cmd, self.timeout
class TrainBertShort(TestSpec):
def prepare(self, dev, seed):
random.seed(seed)
self.env = {
**bert_train_params,
"IGNORE_BEAM_CACHE": 1,
"BEAM": 5,
"BEAM_UOPS_MAX": 10000,
"BEAM_UPCAST_MAX": 256,
"BEAM_LOCAL_MAX": 1024,
"BEAM_MIN_PROGRESS": 5,
"IGNORE_JIT_FIRST_BEAM": 1,
"SEED": seed,
"BENCHMARK": 4096,
"JIT": 2
}
self.cmd = "python3 examples/mlperf/model_train.py"
self.timeout = 2 * 60 * 60 # 2 hours
def get_exec_state(self): return self.env, self.cmd, self.timeout
class BertBeam(TestSpec):
def prepare(self, dev, seed):
random.seed(seed)
self.env = {
**bert_train_params,
"IGNORE_BEAM_CACHE": 1,
"BEAM": random.choice([1, 2, 3, 4, 5]),
"BEAM_UOPS_MAX": 10000,
"BEAM_UPCAST_MAX": 256,
"BEAM_LOCAL_MAX": 1024,
"BEAM_MIN_PROGRESS": 5,
"IGNORE_JIT_FIRST_BEAM": 1,
"SEED": seed,
"RESET_STEP": 1,
"BENCHMARK": 10,
"BERT_LAYERS": 2,
"SEED": seed,
}
self.cmd = "python3 examples/mlperf/model_train.py"
self.timeout = 1 * 60 * 60 # 1 hour
def get_exec_state(self): return self.env, self.cmd, self.timeout