ZebraLogic / eval_utils.py
yuchenlin's picture
add truth data viewer
c1a5b93
raw
history blame
No virus
8.94 kB
import json
from collections import defaultdict
import os
from tabulate import tabulate
from datasets import load_dataset
# Module-level cache mapping a puzzle id to its solution record.
# It starts empty and is filled by load_private_solutions().
private_solutions = {}
def load_private_solutions():
    """Fill the module-level ``private_solutions`` cache.

    Loads the ``test`` split of the ``grid_mode`` configuration of
    ``allenai/ZebraLogicBench-private`` and stores every record's
    ``solution`` under the record's ``id``.  An id that is already cached
    is overwritten.  Returns ``None``.
    """
    global private_solutions
    dataset = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
    private_solutions.update((record["id"], record["solution"]) for record in dataset)
def load_model_results(run_name_folders):
    """Collect the result files of every run.

    Args:
        run_name_folders: Mapping of run name (e.g. ``"greedy"``) to the
            folder that is scanned for ``*.json`` result files.  Only the
            folder itself is scanned, not its sub-folders.

    Returns:
        Dict mapping ``"<file name without .json>%<run name>"`` to the path
        of the file.  Entries are inserted run by run, and within a run in
        sorted file-name order.

    Raises:
        OSError: If a folder cannot be listed.
    """
    suffix = ".json"
    model_results = {}
    for run_name, folder in run_name_folders.items():
        # os.listdir returns entries in arbitrary order; sort so the mapping
        # (and everything derived from it) is reproducible.
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith(suffix):
                continue
            filepath = os.path.join(folder, filename)
            # A directory whose name happens to end in ".json" is not a
            # result file and could not be opened later on.
            if not os.path.isfile(filepath):
                continue
            # Strip only the trailing extension; str.replace would also
            # remove a ".json" that appears inside the model name.
            model_name = filename[: -len(suffix)]
            model_results[f"{model_name}%{run_name}"] = filepath
    return model_results
def extract_last_complete_json(s):
    """Return the last brace-balanced JSON object found in ``s``.

    The text is scanned once while matching ``{`` with ``}``.  The span that
    was closed last is, by construction, not nested inside any other closed
    span, so it is the last complete object of the text.  An opening brace
    that is never closed (for example in free text in front of the answer)
    therefore no longer hides the complete objects that follow it.

    Braces are matched purely textually: a brace inside a JSON string is
    counted like any other brace.

    Args:
        s: Arbitrary text, typically a raw model output.

    Returns:
        The decoded object, or ``None`` when the text contains no balanced
        span or when that span is not valid JSON.
    """
    open_positions = []  # indices of '{' that are still waiting for a '}'
    last_span = None     # (start, end) of the most recently closed span
    for i, char in enumerate(s):
        if char == "{":
            open_positions.append(i)
        elif char == "}" and open_positions:
            # A '}' without a pending '{' is ignored.
            last_span = (open_positions.pop(), i + 1)

    if last_span is None:
        return None
    start, end = last_span
    # Newlines are removed before decoding, as raw newlines inside a JSON
    # string would otherwise make json.loads reject the text.
    candidate = s[start:end].replace("\n", "")
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None
def eval_each_puzzle(id, prediction_table):
    """Score one predicted grid against the stored solution of a puzzle.

    The solutions are loaded on first use when the module-level cache is
    still empty.

    Args:
        id: Key of the puzzle in ``private_solutions``.
        prediction_table: Mapping ``"House <n>"`` -> ``{column: value}``.
            A value may be a string, or a list whose first element is the
            answer.  Missing houses or columns, ``None``, empty values and
            values of any other type are counted as not correct.

    Returns:
        Tuple ``(total_cells, correct_cells, solution)`` where ``solution``
        is the stored solution record of the puzzle.

    Raises:
        KeyError: If ``id`` is not a known puzzle.
    """
    global private_solutions
    if not private_solutions:
        load_private_solutions()
    solution = private_solutions[id]
    columns = solution["header"]
    assert columns[0] == "House"

    # Re-key the solution rows as "House <n>" -> {column: value}; the first
    # column is the house number itself and is not scored.
    solution_table = {
        f"House {index + 1}": {columns[j]: row[j] for j in range(1, len(columns))}
        for index, row in enumerate(solution["rows"])
    }
    this_total_cells = len(solution["rows"]) * (len(columns) - 1)

    predicted_rows = prediction_table if isinstance(prediction_table, dict) else {}
    this_correct_cells = 0
    for house, truth_row in solution_table.items():
        predicted_row = predicted_rows.get(house)
        if not isinstance(predicted_row, dict):
            continue
        for column, truth in truth_row.items():
            if column not in predicted_row:
                continue
            predicted = predicted_row[column]
            if isinstance(predicted, list):
                # Only the first candidate of a list answer is scored.
                predicted = predicted[0] if predicted else None
            # Anything that is not a non-empty string cannot match.  Checking
            # this per cell also guarantees that a value left over from the
            # previous cell is never compared by accident.
            if not isinstance(predicted, str) or not predicted:
                continue
            if truth.lower().strip() == predicted.lower().strip():
                this_correct_cells += 1
    return this_total_cells, this_correct_cells, solution
def _format_ratio(numerator, denominator, scale=100):
    """Format ``numerator / denominator * scale`` with two decimals.

    Returns ``"0.00"`` for a zero denominator so that an empty bucket does
    not abort the evaluation and the value can still be parsed by ``float``.
    """
    if not denominator:
        return "0.00"
    return f"{numerator / denominator * scale:.2f}"


def eval_model(model, filepath):
    """Evaluate one result file and build its summary row.

    The solutions are loaded on first use when the module-level cache is
    still empty.

    Args:
        model: Key of the form ``"<model name>%<run name>"``; the two parts
            are reported as ``"Model"`` and ``"Mode"``.
        filepath: JSON file holding a list of items.  Each item provides
            ``"id"`` (puzzle id), ``"size"`` (e.g. ``"3*4"``) and
            ``"output"`` (list whose first element is the raw model output).

    Returns:
        Dict with the keys ``Model``, ``Mode``, ``Puzzle Acc``, ``Cell Acc``,
        ``No answer``, ``Easy Puzzle Acc``, ``Hard Puzzle Acc``,
        ``Total Puzzles`` and ``Reason Lens``.  All rates are percentages
        formatted with two decimals; a rate whose denominator is zero is
        reported as ``"0.00"``.  ``Reason Lens`` is the mean character length
        of the ``"reasoning"`` field over the parsed answers.

    Raises:
        KeyError: If an item refers to an unknown puzzle id.
        ValueError: If a predicted cell is neither a string nor a list.
    """
    global private_solutions
    if not private_solutions:
        load_private_solutions()
    with open(filepath, "r", encoding="utf-8") as f:
        print(f"Processing {filepath}")
        data = json.load(f)

    solved_puzzles = 0
    num_total_puzzles = len(data)
    correct_cells = 0
    total_cells = 0
    no_answer = 0
    num_total_puzzles_by_size = defaultdict(int)
    solved_puzzles_by_size = defaultdict(int)
    reason_lens = []
    for item in data:
        solution = private_solutions[item["id"]]
        size = item["size"]
        num_total_puzzles_by_size[size] += 1

        # Re-key the solution rows as "House <n>" -> {column: value}; the
        # first column is the house number itself and is not scored.
        columns = solution["header"]
        assert columns[0] == "House"
        solution_table = {
            f"House {index + 1}": {columns[j]: row[j] for j in range(1, len(columns))}
            for index, row in enumerate(solution["rows"])
        }
        this_total_cells = len(solution["rows"]) * (len(columns) - 1)
        # Cells of unanswered puzzles still count towards the denominator.
        total_cells += this_total_cells

        # Read and parse the prediction from the model output.
        prediction_json = extract_last_complete_json(item["output"][0])
        if prediction_json is None or "solution" not in prediction_json:
            no_answer += 1
            continue
        reason = prediction_json.get("reasoning", "")
        prediction_table = prediction_json["solution"]
        reason_lens.append(len(reason))

        this_correct_cells = 0
        for house, truth_row in solution_table.items():
            for column, truth in truth_row.items():
                # Cells the model did not predict are simply not correct.
                if house not in prediction_table or column not in prediction_table[house]:
                    continue
                predicted = prediction_table[house][column]
                if predicted is None or len(predicted) == 0:
                    continue
                if isinstance(predicted, list):
                    # Only the first candidate of a list answer is scored.
                    predicted_cell = predicted[0].lower().strip()
                elif isinstance(predicted, str):
                    predicted_cell = predicted.lower().strip()
                else:
                    raise ValueError(f"Unknown type: {type(predicted)}")
                if truth.lower().strip() == predicted_cell:
                    this_correct_cells += 1
        correct_cells += this_correct_cells

        # A puzzle is solved only when every cell is correct.
        if this_correct_cells == this_total_cells:
            solved_puzzles += 1
            solved_puzzles_by_size[size] += 1

    easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3',]
    hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
    easy_solved_puzzles = sum(solved_puzzles_by_size[size] for size in easy_sizes)
    easy_total_puzzles = sum(num_total_puzzles_by_size[size] for size in easy_sizes)
    hard_solved_puzzles = sum(solved_puzzles_by_size[size] for size in hard_sizes)
    hard_total_puzzles = sum(num_total_puzzles_by_size[size] for size in hard_sizes)

    name_parts = model.split("%")
    result = {}
    result["Model"] = name_parts[0]
    result["Mode"] = name_parts[1]
    result["Puzzle Acc"] = _format_ratio(solved_puzzles, num_total_puzzles)
    result["Cell Acc"] = _format_ratio(correct_cells, total_cells)
    result["No answer"] = _format_ratio(no_answer, num_total_puzzles)
    result["Easy Puzzle Acc"] = _format_ratio(easy_solved_puzzles, easy_total_puzzles)
    result["Hard Puzzle Acc"] = _format_ratio(hard_solved_puzzles, hard_total_puzzles)
    result["Total Puzzles"] = num_total_puzzles
    result["Reason Lens"] = _format_ratio(sum(reason_lens), len(reason_lens), scale=1)
    return result
def gen_results(run_name_folders):
    """Evaluate every result file, print a table and save the summary.

    Args:
        run_name_folders: Mapping of run name to result folder, passed on to
            ``load_model_results``.

    The rows returned by ``eval_model`` are ordered by descending
    ``"Puzzle Acc"``, printed with ``tabulate`` and written as JSON to
    ``result_dirs/zebra-grid.summary.json``.  Returns ``None``.
    """
    columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
    rows = [
        eval_model(model_name, filepath)
        for model_name, filepath in load_model_results(run_name_folders).items()
    ]
    # Best puzzle accuracy first; rows with equal accuracy keep their order.
    rows.sort(key=lambda summary: float(summary["Puzzle Acc"]), reverse=True)

    # tabulate expects one list of cell values per row, in header order.
    table_data = [[summary[name] for name in columns] for summary in rows]
    print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))

    # Persist the full rows (not just the printed cells) as JSON.
    with open("result_dirs/zebra-grid.summary.json", "w") as f:
        f.write(json.dumps(rows, indent=2))
if __name__ == "__main__":
    # Result folders to summarize, keyed by run name; the run name ends up
    # in the "Mode" column of the summary.
    run_name_folders = dict(
        greedy="result_dirs/zebra-grid",
        sampling="result_dirs/zebra-grid/sampling",
    )
    # Fill the solution cache once before any result file is evaluated.
    load_private_solutions()
    gen_results(run_name_folders)