def is_success(row):
    """Return True when *row* describes a successful trace.

    A row counts as successful when either:

    * its ``"result"`` value is a string that, lower-cased, is one of
      ``resolved``, ``success``, ``pass``, ``passed`` or ``correct``; or
    * its ``"reward"`` value converts to a float that is at least ``1.0``.

    Args:
        row: Mapping-like object exposing ``.get`` for the ``"result"`` and
            ``"reward"`` keys. Either key may be missing or ``None``.

    Returns:
        bool: ``True`` for a successful row, ``False`` otherwise (including
        when neither field is usable).
    """
    res = row.get("result")
    # Only string labels are compared; a non-string result (e.g. an int or
    # bool flag) falls through to the reward check instead of raising
    # AttributeError on ``.lower()``.
    if isinstance(res, str) and res.lower() in (
        "resolved",
        "success",
        "pass",
        "passed",
        "correct",
    ):
        return True
    try:
        return float(row.get("reward")) >= 1.0
    except (TypeError, ValueError):
        # Missing (None) or non-numeric reward: treat as not successful.
        return False
# Export a small JSONL file of successful traces taken from the streamed
# dataset: look at no more than SCAN rows and stop once KEEP traces are written.
out_path = "agenttrove_clean_sft.jsonl"
SCAN, KEEP = 1500, 200
kept = scanned = 0
print(f"\n⏳ Scanning up to {SCAN} rows, keeping up to {KEEP} successful traces…")
with open(out_path, "w") as f:
    # enumerate(start=1) keeps `scanned` equal to the number of rows pulled
    # from the stream so far; it stays 0 if the stream yields nothing.
    for scanned, row in enumerate(
        itertools.islice(load_dataset(REPO, split="train", streaming=True), SCAN),
        start=1,
    ):
        if not is_success(row):
            continue
        turns = normalize_turns(row[TRACE_KEY])
        # Drop turns whose content is empty or whitespace-only.
        conv = [{"from": r, "value": c} for r, c in turns if c.strip()]
        if len(conv) < 2:
            # Fewer than two non-empty turns: skip this row.
            continue
        record = {
            "conversations": conv,
            "source": row.get("original_source"),
            "teacher": row.get("original_teacher"),
        }
        # One JSON object per line (JSONL).
        f.write(json.dumps(record) + "\n")
        kept += 1
        if kept >= KEEP:
            break
print(f"✅ Scanned {scanned} rows → wrote {kept} clean traces to '{out_path}'")
def search_traces(keyword=None, source=None, limit=3, scan=3000):
    """Stream the dataset and render the first traces that match the filters.

    At most ``scan`` rows of the streamed ``train`` split are examined. A row
    matches when its ``original_source`` equals ``source`` (if given) and the
    space-joined content of its turns contains ``keyword`` case-insensitively
    (if given). Each match is passed to ``render_trace(row, max_chars=300)``.
    If no row matched, a hint to increase ``scan`` is printed.

    Args:
        keyword: Substring to look for in the trace text; a falsy value
            disables the keyword filter.
        source: Required value of ``row["original_source"]``; a falsy value
            disables the source filter.
        limit: Maximum number of matching traces to render. A value of zero
            or less renders nothing and returns without streaming.
        scan: Maximum number of rows to pull from the stream.

    Returns:
        None.
    """
    if limit <= 0:
        # The limit is only checked after a trace has been rendered, so
        # without this guard a non-positive limit would still render one.
        return
    # Lower-case the keyword once instead of on every row.
    needle = keyword.lower() if keyword else None
    hits = 0
    for row in itertools.islice(load_dataset(REPO, split="train", streaming=True), scan):
        if source and row.get("original_source") != source:
            continue
        if needle is not None:
            # Join all turn contents so the keyword is matched against the
            # whole trace text at once.
            blob = " ".join(c for _, c in normalize_turns(row[TRACE_KEY]))
            if needle not in blob.lower():
                continue
        render_trace(row, max_chars=300)
        hits += 1
        if hits >= limit:
            break
    if hits == 0:
        print("No matches in the scanned window — try increasing `scan`.")
# Demo: render up to two traces whose original_source is "nl2bash", scanning
# at most 4000 streamed rows.
print("\n🔍 Searching for 'nl2bash' source traces:")
search_traces(source="nl2bash", scan=4000, limit=2)

# Closing message, one item per output line.
print(
    "\n🎉 Tutorial complete! Next ideas:",
    "   • Increase N / SCAN for bigger analyses.",
    "   • Filter by original_source (swesmith, codeforces, r2egym…) for a domain SFT set.",
    "   • Feed agenttrove_clean_sft.jsonl into Axolotl / LLaMA-Factory for fine-tuning.",
    sep="\n",
)
# Web-page footer text captured along with the script (not code): "Share this."
# "Leave a comment."