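"""Aggregate Sotopia evaluation results for a target model.

Reads EpisodeLog records from Redis for a given tag, splits them by whether
the target model played agent 1 or agent 2 in each environment, and writes
per-dimension average scores to JSON. The field layout assumed here is
inferred from the accesses below: EpisodeLog.models is
[env_model, agent_1_model, agent_2_model], and EpisodeLog.rewards[i] is an
(overall_score, per-dimension dict) pair for agent i.
"""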
import argparse
import json
import os

import numpy as np

# REDIS_OM_URL must be set before the sotopia imports below, which read it
# when the Redis models are loaded. The credentials here are placeholders.
os.environ["REDIS_OM_URL"] = "redis://:password@server_name:port_num"

from sotopia.database.logs import EpisodeLog
from sotopia.database.persistent_profile import AgentProfile, EnvironmentProfile

# tag = "pilot-2_checkpoint_improve-0_epoch-3_gpt-3.5-turbo_dev"
# target_model = "custom_model"

# hard_envs = ["01HJPQ34Y3S1TDPTRX1CCH6VPG", "01HJPQ34ZG9WZEDX6BV5QZB1QG"]


def gen_target_result_dict(envs: list, tag: str, target_model: str) -> list:
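    """Collect the target model's per-episode performance for each environment.

    Returns a list with one entry per environment id in `envs`; each entry
    records the scenario, both agents' goals, and the target model's reward
    and reasoning for every matching episode, split by which side it played.
    """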
    target_result_by_env = []
    for env_profile_id in envs:
        env = EnvironmentProfile.get(env_profile_id)
        target_result_dict = {
            "env_profile_id": env_profile_id,
            "scenario": env.scenario,
            "target_as_agent_1": {
                "agent_env_goal": env.agent_goals[0],
                "agent_performance_by_profile": [],
            },
            "target_as_agent_2": {
                "agent_env_goal": env.agent_goals[1],
                "agent_performance_by_profile": [],
            },
        }

        eps = list(
            EpisodeLog.find(
                EpisodeLog.tag == tag,
                EpisodeLog.environment == env_profile_id,
            )
        )

        for ep in eps:
            # models is [env_model, agent_1_model, agent_2_model], so index
            # agent_idx + 1 gives the model that played agent agent_idx.
            for agent_idx in (0, 1):
                if ep.models[agent_idx + 1] != target_model:
                    continue
                agent_id = ep.agents[agent_idx]
                agent_profile = AgentProfile.get(agent_id)
                target_result_dict[f"target_as_agent_{agent_idx + 1}"][
                    "agent_performance_by_profile"
                ].append({
                    "agent_profile_id": agent_id,
                    "agent_first_name": agent_profile.first_name,
                    "agent_last_name": agent_profile.last_name,
                    "reward": ep.rewards[agent_idx],
                    "reasoning": ep.reasoning,
                })

        target_result_by_env.append(target_result_dict)

    return target_result_by_env


def eval_average(target_result_by_env: list, tag: str) -> dict:
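    """Average each evaluation dimension over all episodes with `tag`.

    Scores are summed across environments and divided by the total episode
    count, which yields a per-episode mean assuming every episode with this
    tag belongs to one of the evaluated environments and features the target
    model exactly once.
    """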
    avg_dict = {
        "believability": 0.0,
        "relationship": 0.0,
        "knowledge": 0.0,
        "secret": 0.0,
        "social_rules": 0.0,
        "financial_and_material_benefits": 0.0,
        "goal": 0.0,
        "overall_score": 0.0,
    }

    eps = list(EpisodeLog.find(EpisodeLog.tag == tag))

    for result_dict in target_result_by_env:
        for key in avg_dict:
            # rewards are (overall_score, per-dimension dict) pairs; [1][key]
            # selects one dimension. np.sum of an empty list is 0.0, so
            # environments without target episodes contribute nothing.
            perf_as_agent_1 = np.sum([
                profile["reward"][1][key]
                for profile in result_dict["target_as_agent_1"]["agent_performance_by_profile"]
            ])
            perf_as_agent_2 = np.sum([
                profile["reward"][1][key]
                for profile in result_dict["target_as_agent_2"]["agent_performance_by_profile"]
            ])
            avg_dict[key] += (perf_as_agent_1 + perf_as_agent_2) / len(eps)

    return avg_dict


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--tag", type=str, required=True)
    parser.add_argument("--target-model", type=str, default="custom_model")
    parser.add_argument("--env-ids-tag", type=str, required=True)
    parser.add_argument("--out-dir", type=str, required=True)
    args = parser.parse_args()

    with open("resources/env_ids.json", "r") as f:
        env_dict = json.load(f)
    envs = env_dict[args.env_ids_tag]

    target_result_by_env = gen_target_result_dict(
        envs=envs, target_model=args.target_model, tag=args.tag
    )
    avg_dict = eval_average(target_result_by_env, tag=args.tag)

    os.makedirs(args.out_dir, exist_ok=True)
    with open(os.path.join(args.out_dir, f"{args.tag}.json"), "w") as f:
        f.write(json.dumps(avg_dict, indent=4))
    with open(os.path.join(args.out_dir, "dict.json"), "w") as f:
        f.write(json.dumps(target_result_by_env, indent=4))


if __name__ == "__main__":
    main()

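# Example invocation (script name is a placeholder; resources/env_ids.json is
# assumed to map a name to a list of EnvironmentProfile ids, e.g.
# {"hard": ["01HJPQ34Y3S1TDPTRX1CCH6VPG", "01HJPQ34ZG9WZEDX6BV5QZB1QG"]}):
#
#   python eval_average.py \
#       --tag pilot-2_checkpoint_improve-0_epoch-3_gpt-3.5-turbo_dev \
#       --env-ids-tag hard \
#       --out-dir results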