Merge branch 'mlcommons:master' into unet3d-rcp-fix-v3.0
mmarcinkiewicz authored May 8, 2023
2 parents 7623e38 + 84f6f0c commit 6ab1078
Showing 14 changed files with 210 additions and 28 deletions.
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,3 @@
# These owners will be the default owners for everything in the repo.
# Unless a later match takes precedence, they will be requested for review when someone opens a pull request.
* @mlcommons/wg-benchmark-infra @mlcommons/wg-training
6 changes: 4 additions & 2 deletions mlperf_logging/benchmark_meta.py
@@ -3,7 +3,9 @@
'training': {
'bert': 10,
'dlrm': 5,
'dlrm_dcnv2': 10,
'gnmt': 10,
'gpt3': 3,
'maskrcnn': 5,
'minigo': 10,
'resnet': 5,
@@ -86,9 +88,9 @@
],
'3.0': [
'bert',
'dlrm',
'dlrm_dcnv2',
'gpt3',
'maskrcnn',
'minigo',
'resnet',
'ssd',
'rnnt',
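The per-benchmark integers above are required run counts for a valid submission. A minimal sketch of how such a table could be consumed — not part of mlperf_logging; `has_required_runs` is a hypothetical helper:

```python
# Illustrative only: verify a submission supplies enough result files for
# its benchmark. Counts copied from the diff above; gpt3 needs just 3 runs.
REQUIRED_RUNS = {'bert': 10, 'dlrm_dcnv2': 10, 'gpt3': 3, 'maskrcnn': 5, 'resnet': 5}

def has_required_runs(benchmark, result_files):
    """Return True if the submission has enough runs for this benchmark."""
    required = REQUIRED_RUNS.get(benchmark)
    return required is not None and len(result_files) >= required

assert has_required_runs('gpt3', ['run_0.log', 'run_1.log', 'run_2.log'])
```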
8 changes: 4 additions & 4 deletions mlperf_logging/compliance_checker/README.md
@@ -27,20 +27,20 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
3.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
3.0.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
3.0.0/closed_ssd.yaml
3.0.0/closed_minigo.yaml
3.0.0/closed_maskrcnn.yaml
3.0.0/closed_rnnt.yaml
3.0.0/closed_unet3d.yaml
3.0.0/closed_bert.yaml
3.0.0/closed_dlrm.yaml
3.0.0/closed_dlrm_dcnv2.yaml
3.0.0/closed_gpt3.yaml
3.0.0/open_resnet.yaml - Per-benchmark rules, open submissions.
3.0.0/open_ssd.yaml
3.0.0/open_minigo.yaml
3.0.0/open_maskrcnn.yaml
3.0.0/open_rnnt.yaml
3.0.0/open_unet3d.yaml
3.0.0/open_bert.yaml
3.0.0/open_dlrm.yaml
3.0.0/open_dlrm_dcnv2.yaml
3.0.0/open_gpt3.yaml

### Existing config files for HPC submissions

@@ -2,7 +2,7 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'minigo', 'dlrm', 'dlrmv2', 'bert', 'rnnt', 'unet3d'] "
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_3.0.0/closed_{}.yaml'.format(v['value'])) "

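For orientation: the CHECK above gates the benchmark name, and the POST hook then queues the matching per-benchmark rules file. A minimal sketch of that dispatch, assuming `enqueue_config` simply appends to a queue of rule files to evaluate (the checker's real internals may differ):

```python
# Hedged sketch of the POST dispatch in closed_common; illustrative only.
ALLOWED = ['resnet', 'ssd', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d']
config_queue = []

def enqueue_config(path):
    config_queue.append(path)

v = {'value': 'gpt3'}          # a parsed submission_benchmark log entry
assert v['value'] in ALLOWED   # the CHECK expression
enqueue_config('training_3.0.0/closed_{}.yaml'.format(v['value']))  # the POST hook
# config_queue == ['training_3.0.0/closed_gpt3.yaml']
```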
83 changes: 83 additions & 0 deletions mlperf_logging/compliance_checker/training_3.0.0/closed_gpt3.yaml
@@ -0,0 +1,83 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
POST: >
s['global_batch_size'] = v['value']
- KEY:
NAME: max_sequence_length
REQ: EXACTLY_ONE
CHECK: " v['value'] == 2048 "

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adam' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 "

- KEY:
NAME: opt_end_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_schedule
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'cosine with linear warmup' "

- KEY:
NAME: opt_adam_beta_1
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.9 "

- KEY:
NAME: opt_adam_beta_2
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.95 "

- KEY:
NAME: opt_adam_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-8 "

- KEY:
NAME: opt_gradient_clip_norm
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1.0 "

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

- KEY:
NAME: trained_samples
REQ: EXACTLY_ONE

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6"

- KEY:
NAME: init_checkpoint_step
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

@@ -2,6 +2,6 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'minigo', 'dlrm', 'bert', 'rnnt', 'unet3d'] "
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_3.0.0/open_{}.yaml'.format(v['value'])) "

@@ -4,4 +4,4 @@
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] < 1.0"
ATLEAST_ONE_CHECK: "v['value'] <= 1.0"
11 changes: 8 additions & 3 deletions mlperf_logging/mllog/constants.py
@@ -37,7 +37,7 @@
RESEARCH = "research"

# Constant values - benchmark name
DLRMv2 = "dlrmv2"
DLRM_DCNv2 = "dlrm_dcnv2"
GNMT = "gnmt"
MASKRCNN = "maskrcnn"
MINIGO = "minigo"
@@ -47,7 +47,8 @@
TRANSFORMER = "transformer"
RNNT = "rnnt"
UNET3D = "unet3d"
BERT ="bert"
BERT = "bert"
GPT3 = "gpt3"

# Constant values - model info
ADAGRAD = "adagrad"
@@ -91,6 +92,7 @@
EVAL_SAMPLES = "eval_samples"
SEED = "seed"
TRAIN_SAMPLES = "train_samples"
TRAINED_SAMPLES = "trained_samples"
WEIGHTS_INITIALIZATION = "weights_initialization"

# Log keys - model hyperparameters
@@ -119,6 +121,7 @@
OPT_ADAM_EPSILON = "opt_adam_epsilon"
OPT_NAME = "opt_name"
OPT_BASE_LR = "opt_base_learning_rate"
OPT_END_LR = "opt_end_learning_rate"
OPT_LAMB_LR_MIN = "opt_lamb_learning_rate_min"
OPT_LAMB_LR_DECAY_POLY_POWER = "opt_lamb_learning_rate_decay_poly_power"
OPT_LAMB_WEIGHT_DECAY = "opt_lamb_weight_decay_rate"
@@ -134,6 +137,7 @@
OPT_LR_DECAY_INTERVAL = "opt_learning_rate_decay_interval"
OPT_LR_DECAY_START_STEP = "opt_learning_rate_decay_start_step"
OPT_LR_DECAY_STEPS = "opt_learning_rate_decay_steps"
OPT_LR_DECAY_SCHEDULE = "opt_learning_rate_decay_schedule"
OPT_LR_REMAIN_STEPS = "opt_learning_rate_remain_steps"
OPT_LR_TRAINING_STEPS = "opt_learning_rate_training_steps"
OPT_LR_WARMUP_EPOCHS = "opt_learning_rate_warmup_epochs"
@@ -155,7 +159,8 @@
MODEL_EVAL_EMA_FACTOR = "model_eval_ema_factor"
MODEL_WEIGHTS_INITIALIZATION_SCALE = "model_weights_initialization_scale"
EVAL_MAX_PREDICTION_SYMBOLS = "eval_max_prediction_symbols"
START_WARMUP_STEP ="start_warmup_step"
START_WARMUP_STEP = "start_warmup_step"
INIT_CHECKPOINT_STEP = "init_checkpoint_step"

# Log keys - misc.
BBOX = "bbox"
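The added constants (TRAINED_SAMPLES, OPT_END_LR, OPT_LR_DECAY_SCHEDULE, INIT_CHECKPOINT_STEP) line up with the keys required by closed_gpt3.yaml. A short example of emitting them through mllog; the values below are illustrative, not reference hyperparameters:

```python
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()
# Illustrative values only - not the reference hyperparameters.
mllogger.event(key=constants.OPT_END_LR, value=2e-6)
mllogger.event(key=constants.OPT_LR_DECAY_SCHEDULE, value='cosine with linear warmup')
mllogger.event(key=constants.TRAINED_SAMPLES, value=1157627904)
mllogger.event(key=constants.INIT_CHECKPOINT_STEP, value=4000)
```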
4 changes: 2 additions & 2 deletions mlperf_logging/rcp_checker/rcp_checker.py
@@ -19,8 +19,8 @@
submission_runs = {
"training": {
'bert': 10,
'dlrm': 5,
'dlrmv2': 5,
'dlrm_dcnv2': 10,
'gpt3': 3,
'maskrcnn' : 5,
'resnet' : 5,
'ssd' : 5,
@@ -1,7 +1,7 @@
{

"dlrmv2_ref_32768": {
"Benchmark": "dlrmv2",
"dlrm_dcnv2_ref_32768": {
"Benchmark": "dlrm_dcnv2",
"Creator": "NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "DGX-A100",
@@ -24,8 +24,8 @@
]
},

"dlrmv2_ref_65536": {
"Benchmark": "dlrmv2",
"dlrm_dcnv2_ref_65536": {
"Benchmark": "dlrm_dcnv2",
"Creator": "NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "DGX-A100",
@@ -48,8 +48,8 @@
]
},

"dlrmv2_ref_102400": {
"Benchmark": "dlrmv2",
"dlrm_dcnv2_ref_102400": {
"Benchmark": "dlrm_dcnv2",
"Creator": "NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "DGX-A100",
78 changes: 78 additions & 0 deletions mlperf_logging/rcp_checker/training_3.0.0/rcps_gpt3.json
@@ -0,0 +1,78 @@
{

"gpt3_ref_1536":
{
"Benchmark": "gpt3",
"Creator": "Google & NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM",
"BS": 1536,
"Hyperparams": {
"opt_base_learning_rate": 2e-5
},
"Epochs to converge": [
1157627904, 1157627904, 1157627904, 1258291200, 1207959552, 1258291200
]
},

"gpt3_ref_2048":
{
"Benchmark": "gpt3",
"Creator": "Google & NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM",
"BS": 2048,
"Hyperparams": {
"opt_base_learning_rate": 2e-5
},
"Epochs to converge": [
1157627904, 1207959552, 1157627904, 1207959552, 1207959552, 1157627904
]
},

"gpt3_ref_3072":
{
"Benchmark": "gpt3",
"Creator": "Google & NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM",
"BS": 3072,
"Hyperparams": {
"opt_base_learning_rate": 2e-5
},
"Epochs to converge": [
1258291200, 1207959552, 1207959552, 1207959552, 1207959552, 1207959552
]
},

"gpt3_ref_4096":
{
"Benchmark": "gpt3",
"Creator": "Google & NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM",
"BS": 4096,
"Hyperparams": {
"opt_base_learning_rate": 3e-5
},
"Epochs to converge": [
1258291200, 1258291200, 1308622848, 1258291200, 1258291200, 1258291200
]
},

"gpt3_ref_8192":
{
"Benchmark": "gpt3",
"Creator": "Google & NVIDIA",
"When": "Prior to 3.0 submission",
"Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM",
"BS": 8192,
"Hyperparams": {
"opt_base_learning_rate": 3e-5
},
"Epochs to converge": [
1610612736, 1660944384, 1660944384, 1610612736, 1610612736, 1610612736
]
}

}
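Each RCP entry records, per reference run, how much training was needed to converge at a given global batch size (for gpt3 the "Epochs to converge" values appear to be trained-sample counts matching the trained_samples key, rather than literal epochs). A hedged sketch of the kind of comparison the RCP checker performs; its real statistical test differs:

```python
# Illustrative only: flag a submission that converges faster than the
# reference runs at the same global batch size.
import json
import statistics

with open('rcps_gpt3.json') as f:
    rcps = json.load(f)

ref = rcps['gpt3_ref_1536']
ref_mean = statistics.mean(ref['Epochs to converge'])

# Hypothetical submission: three runs at the same global batch size.
submission = [1157627904, 1207959552, 1157627904]
if statistics.mean(submission) < ref_mean:
    print('Submission converges faster than the reference - flag for RCP review')
```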
14 changes: 7 additions & 7 deletions mlperf_logging/result_summarizer/result_summarizer.py
@@ -593,9 +593,6 @@ def get_parser():
def main():
parser = get_parser()
args = parser.parse_args()
config_path = os.path.join(os.path.dirname(__file__), "xlsx_config.yaml")
with open(config_path, "r") as f:
config = yaml.safe_load(f)

strong_scaling_summaries = []
weak_scaling_summaries = []
@@ -632,15 +629,18 @@ def _update_summaries(folder):
# Parse results for single organization.
_update_summaries(args.folder)

def _map_availability(availability):
def _map_availability(availability, config):
map_ = config["availability"]
return map_.get(availability, availability)

def _map_columns_index(column):
def _map_columns_index(column, config):
map_ = config["columns"][args.usage][args.ruleset]
return tuple(map_.get(column, map_.get("default") + [column]))

def _summaries_to_xlsx(summaries: pd.DataFrame, path, version):
config_path = os.path.join(os.path.dirname(__file__), "xlsx_config.yaml")
with open(config_path, "r") as f:
config = yaml.safe_load(f)
writer = pd.ExcelWriter(path, engine="xlsxwriter")
index = 0
workbook = writer.book
@@ -658,12 +658,12 @@ def _summaries_to_xlsx(summaries: pd.DataFrame, path, version):
for division in ["closed", "open"]:
sheet_data = summaries[summaries["division"] == division]
sheet_data["availability"] = sheet_data["availability"].apply(
_map_availability
lambda x: _map_availability(x, config)
)
aux_df = pd.DataFrame(
[],
columns=pd.MultiIndex.from_tuples(
[_map_columns_index(c) for c in sheet_data.columns]
[_map_columns_index(c, config) for c in sheet_data.columns]
),
)
aux_df.to_excel(writer, sheet_name=division)
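Net effect of this refactor: xlsx_config.yaml is now loaded inside _summaries_to_xlsx rather than in main(), and config is threaded into the helpers explicitly instead of being captured from enclosing scope. For comparison, the lambda wrappers could equally be written with functools.partial (illustrative):

```python
from functools import partial

# Equivalent to `lambda x: _map_availability(x, config)` above.
sheet_data['availability'] = sheet_data['availability'].apply(
    partial(_map_availability, config=config)
)
```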