repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources16d ago| #1 | import argparse |
| #2 | import concurrent.futures |
| #3 | import json |
| #4 | import threading |
| #5 | from collections import defaultdict |
| #6 | |
| #7 | from metrics.llm_judge import evaluate_llm_judge |
| #8 | from metrics.utils import calculate_bleu_scores, calculate_metrics |
| #9 | from tqdm import tqdm |
| #10 | |
| #11 | |
def process_item(item_data):
    """Score the answered questions of a single ``(key, items)`` entry.

    Args:
        item_data: A two-element pair ``(key, items)``. ``items`` is an
            iterable of mappings that each provide the keys ``"answer"``,
            ``"response"``, ``"category"`` and ``"question"``.

    Returns:
        A ``defaultdict(list)`` holding, under ``key``, one record per scored
        item. Items whose category stringifies to ``"5"`` are left out, so
        the mapping is empty when every item is skipped.
    """
    key, entries = item_data
    scored = defaultdict(list)

    for entry in entries:
        # Normalise every field to text before comparing or scoring.
        reference = str(entry["answer"])
        prediction = str(entry["response"])
        category = str(entry["category"])
        question = str(entry["question"])

        # Category 5 is excluded from scoring entirely.
        if category != "5":
            overlap = calculate_metrics(prediction, reference)
            bleu = calculate_bleu_scores(prediction, reference)
            judge_score = evaluate_llm_judge(question, reference, prediction)

            record = {
                "question": question,
                "answer": reference,
                "response": prediction,
                "category": category,
                "bleu_score": bleu["bleu1"],
                "f1_score": overlap["f1"],
                "llm_score": judge_score,
            }
            scored[key].append(record)

    return scored
| #43 | |
| #44 | |
def main(argv=None):
    """Evaluate a RAG results file and save per-question scores as JSON.

    Loads the JSON mapping given by ``--input_file``, submits every
    ``(key, items)`` pair to ``process_item`` on a thread pool of
    ``--max_workers`` threads, merges the returned per-key lists, and writes
    the merged mapping to ``--output_file``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, so a plain ``main()`` call
            behaves as before.

    Raises:
        Exception: Whatever a worker raised is re-raised by
            ``future.result()``; the output file is not written in that case.
    """
    parser = argparse.ArgumentParser(description="Evaluate RAG results")
    parser.add_argument(
        "--input_file", type=str, default="results/rag_results_500_k1.json", help="Path to the input dataset file"
    )
    parser.add_argument(
        "--output_file", type=str, default="evaluation_metrics.json", help="Path to save the evaluation results"
    )
    parser.add_argument("--max_workers", type=int, default=10, help="Maximum number of worker threads")

    args = parser.parse_args(argv)

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(args.input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = defaultdict(list)

    # Use ThreadPoolExecutor with specified workers
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
        futures = [executor.submit(process_item, item_data) for item_data in data.items()]

        # Merging happens only on this thread (workers just return their own
        # mapping), so no lock is needed around ``results``.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for k, items in future.result().items():
                results[k].extend(items)

    # Save results to JSON file
    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Results saved to {args.output_file}")
| #78 | |
| #79 | |
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
| #82 |