Feature Request: add error details summary to request file when a model fails

#935
by CombinHorizon - opened

see: https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_requests/commit/7670f45ab11e9cd328f6f0f4c11ffdf82a738693
and an example of a failed model:

+    "status": "FAILED",
    "submitted_time": "2024-09-09T17:52:08Z",
    "model_type": "๐Ÿ”ถ : fine-tuned/fp on domain-specific datasets",
    "source": "leaderboard",
    "job_id": 1063,
+    "job_start_time": "2024-09-10T01-40-21.109448",
+    "error_msg": "[Errno 28] No space left on device",
+    "traceback": "Traceback (most recent call last):\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 200, in wait_download_and_run_request\n    run_request(\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 71, in run_request\n    results = run_eval_on_model(\n              ^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/run_eval.py\", line 60, in run_eval_on_model\n    result = evaluate(\n             ^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/lm_eval_util.py\", line 145, in evaluate\n    results = evaluator.simple_evaluate(\n              ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/utils.py\", line 419, in _wrapper\n    return fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/evaluator.py\", line 100, in simple_evaluate\n    lm = lm_eval.api.registry.get_model(model).create_from_arg_string(\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/api/model.py\", line 134, in create_from_arg_string\n    return cls(**args, **args2)\n           ^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 304, in __init__\n    self._create_model(\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 616, in _create_model\n    self._model = self.AUTO_MODEL_CLASS.from_pretrained(\n                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py\", line 564, in from_pretrained\n    return model_class.from_pretrained(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/modeling_utils.py\", line 3658, in from_pretrained\n    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(\n                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/utils/hub.py\", line 1076, in get_checkpoint_shard_files\n    for shard_filename in tqdm(shard_filenames, desc=\"Downloading shards\", disable=not show_progress_bar):\n                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/utils/logging.py\", line 361, in __call__\n    return tqdm_lib.tqdm(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/tqdm/asyncio.py\", line 24, in __init__\n    super(tqdm_asyncio, self).__init__(iterable, *args, **kwargs)\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/tqdm/std.py\", line 1097, in __init__\n    self.sp = self.status_printer(self.fp)\n              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/tqdm/std.py\", line 451, in status_printer\n    getattr(sys.stderr, 'flush', lambda: None)()\nOSError: [Errno 28] No space left on device\n"
}

Maybe a length-limited summary of the error or failure could be added, to help put more eyeballs on why a model failed and make it easier to diagnose (hard to do blindly):

  • is it a problem with the model itself, or some external factor?

The logs help, especially when there's context: if a lot of unrelated models fail at the same time with the same error message (e.g. "No space left on device"), the affected models could perhaps be restarted when/if the hardware/software environment frees up. A rough grouping pass is sketched below.
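As an illustration (not the leaderboard's actual tooling), such a grouping pass over the request files could look something like this; the directory name and the list of "external factor" patterns are hypothetical:

```python
import json
from collections import defaultdict
from pathlib import Path

# Patterns suggesting an external/infrastructure problem rather than a broken
# model (a made-up starter list that maintainers would tune).
INFRA_PATTERNS = ("No space left on device", "CUDA out of memory")

def group_failed_requests(requests_dir: str) -> dict[str, list[Path]]:
    """Group FAILED request files by their "error_msg" field."""
    groups: defaultdict[str, list[Path]] = defaultdict(list)
    for path in Path(requests_dir).rglob("*.json"):
        data = json.loads(path.read_text())
        if data.get("status") == "FAILED":
            groups[data.get("error_msg", "<no error_msg>")].append(path)
    return dict(groups)

if __name__ == "__main__":
    # "llm_pt_leaderboard_requests" stands in for a local clone of the requests dataset.
    groups = group_failed_requests("llm_pt_leaderboard_requests")
    for msg, paths in sorted(groups.items(), key=lambda kv: -len(kv[1])):
        tag = "infra?" if any(p in msg for p in INFRA_PATTERNS) else "model?"
        print(f"{len(paths):3d}x [{tag}] {msg[:80]}")
```

Sorting by group size makes a burst of identical infrastructure errors stand out immediately from genuinely model-specific failures.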

Maybe the whole log isn't needed, but perhaps the main error, or a one-liner; a sketch of extracting one is below.
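As a minimal sketch (not the leaderboard's actual code), assuming the one-liner is just the final exception line of the stored traceback, truncated to a fixed length:

```python
def error_summary(traceback_text: str, max_len: int = 200) -> str:
    """Condense a full traceback into a length-limited one-line summary.

    The last non-empty line of a Python traceback is normally the
    exception itself, e.g. "OSError: [Errno 28] No space left on device".
    """
    lines = [ln.strip() for ln in traceback_text.strip().splitlines() if ln.strip()]
    return (lines[-1] if lines else "unknown error")[:max_len]

# Using the shape of the first failure above:
tb = 'Traceback (most recent call last):\n  ...\nOSError: [Errno 28] No space left on device\n'
print(error_summary(tb))  # -> OSError: [Errno 28] No space left on device
```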

Some other log examples:

+    "status": "FAILED",
    "submitted_time": "2024-08-28T16:27:37Z",
    "model_type": "๐Ÿ”ถ : fine-tuned/fp on domain-specific datasets",
    "source": "leaderboard",
    "job_id": 1044,
+    "job_start_time": "2024-09-01T03-51-16.696093",
+    "error_msg": "batch size must be positive",
+    "traceback": "Traceback (most recent call last):\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 200, in wait_download_and_run_request\n    run_request(\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 71, in run_request\n    results = run_eval_on_model(\n              ^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/run_eval.py\", line 60, in run_eval_on_model\n    result = evaluate(\n             ^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/lm_eval_util.py\", line 145, in evaluate\n    results = evaluator.simple_evaluate(\n              ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/utils.py\", line 419, in _wrapper\n    return fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/evaluator.py\", line 159, in simple_evaluate\n    results = evaluate(\n              ^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/utils.py\", line 419, in _wrapper\n    return fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/evaluator.py\", line 343, in evaluate\n    resps = getattr(lm, reqtype)(cloned_reqs)\n            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 1525, in generate_until\n    cont = self._model_generate(\n           ^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 1070, in _model_generate\n    return self.model.generate(\n           ^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/utils/_contextlib.py\", line 115, in decorate_context\n    return func(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/generation/utils.py\", line 1989, in generate\n    result = self._sample(\n             ^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/generation/utils.py\", line 2932, in _sample\n    outputs = self(**model_inputs, return_dict=True)\n              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n    return forward_call(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py\", line 1030, in forward\n    outputs = self.model(\n              ^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n    return forward_call(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File 
\"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py\", line 805, in forward\n    layer_outputs = decoder_layer(\n                    ^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n    return forward_call(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py\", line 546, in forward\n    hidden_states, self_attn_weights, present_key_value = self.self_attn(\n                                                          ^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n    return forward_call(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py\", line 379, in forward\n    attn_output = _flash_attention_forward(\n                  ^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/modeling_flash_attention_utils.py\", line 276, in _flash_attention_forward\n    attn_output = flash_attn_varlen_func(\n                  ^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/flash_attn/flash_attn_interface.py\", line 1066, in flash_attn_varlen_func\n    return FlashAttnVarlenFunc.apply(\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/torch/autograd/function.py\", line 539, in apply\n    return super().apply(*args, **kwargs)  # type: ignore[misc]\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/flash_attn/flash_attn_interface.py\", line 581, in forward\n    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(\n                                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/flash_attn/flash_attn_interface.py\", line 86, in _flash_attn_varlen_forward\n    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(\n                                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^\nRuntimeError: batch size must be positive\n"

For this one, it seems to be a model-compatibility issue with the benchmark setup:


+    "status": "FAILED",
    "submitted_time": "2024-08-28T16:21:56Z",
    "model_type": "๐Ÿ”ถ : fine-tuned/fp on domain-specific datasets",
    "source": "leaderboard",
    "job_id": 1043,
+    "job_start_time": "2024-09-01T03-48-21.738077",
+    "error_msg": "The checkpoint you are trying to load has model type `falcon_mamba` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.",
+    "traceback": "Traceback (most recent call last):\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py\", line 989, in from_pretrained\n    config_class = CONFIG_MAPPING[config_dict[\"model_type\"]]\n                   ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py\", line 691, in __getitem__\n    raise KeyError(key)\nKeyError: 'falcon_mamba'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 200, in wait_download_and_run_request\n    run_request(\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/evaluate_llms.py\", line 71, in run_request\n    results = run_eval_on_model(\n              ^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/run_eval.py\", line 60, in run_eval_on_model\n    result = evaluate(\n             ^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/llm_leaderboard_eval_bot/lm_eval_util.py\", line 145, in evaluate\n    results = evaluator.simple_evaluate(\n              ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/utils.py\", line 419, in _wrapper\n    return fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/evaluator.py\", line 100, in simple_evaluate\n    lm = lm_eval.api.registry.get_model(model).create_from_arg_string(\n         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/api/model.py\", line 134, in create_from_arg_string\n    return cls(**args, **args2)\n           ^^^^^^^^^^^^^^^^^^^^\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 291, in __init__\n    self._get_config(\n  File \"/workspace/repos/llm_leaderboard/lm-evaluation-harness-pt/lm_eval/models/huggingface.py\", line 545, in _get_config\n    self._config = transformers.AutoConfig.from_pretrained(\n                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/root/miniconda3/envs/torch21/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py\", line 991, in from_pretrained\n    raise ValueError(\nValueError: The checkpoint you are trying to load has model type `falcon_mamba` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.\n"
}
Open LLM Leaderboard org

Hi @CombinHorizon ,

Thank you for your suggestion!

Unfortunately, we are currently unable to add this info to the request files, but we are working on an internal tool to group all failed models by error and resubmit them faster! We will consider whether we can add this information to the request files.

Let me close this discussion; please feel free to open a new one if you have any questions!

alozowski changed discussion status to closed
