# Baseline 1: train and evaluate a classifier that predicts `output_type`
# from the `context` text, then write its report artifacts under OUT_DIR.
rprint(Panel.fit("[bold]Baseline 1: Predict output_type from context using pure Python Naive Bayes[/bold]"))

# Maps artifact name -> file path (as str). Only baselines that actually run
# add entries, so the dict stays empty when both are skipped.
model_artifacts = {}

# Keep only rows whose output_type label is present and non-empty.
classifier_df = df.dropna(subset=["output_type"]).copy()
classifier_df = classifier_df[
    classifier_df["output_type"].astype(str).str.len() > 0
].copy()

# Require at least two distinct labels and 30 rows before training.
if classifier_df["output_type"].nunique() >= 2 and len(classifier_df) >= 30:
    X_text = (
        classifier_df["context"]
        .fillna("")
        .astype(str)
        .map(lambda text: text[:12000])  # cap every document at 12,000 characters
        .tolist()
    )
    y = classifier_df["output_type"].astype(str).tolist()

    train_indices, test_indices = stratified_train_test_indices(y, test_size=0.2, seed=SEED)
    X_train = [X_text[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X_text[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]

    output_type_classifier = PureMultinomialNB(
        max_features=20000,
        min_df=2,
        alpha=1.0,
    )
    # NOTE(review): `match` is kept exactly as this file spells the training
    # method of PureMultinomialNB; the class is defined outside this section
    # (conventionally this would be `fit`) -- confirm against the definition.
    output_type_classifier.match(X_train, y_train)
    predictions = output_type_classifier.predict(X_test)

    output_type_metrics, output_report_df = evaluate_predictions(y_test, predictions)
    output_matrix_df = confusion_matrix_df(y_test, predictions)
    output_type_metrics["train_rows"] = len(X_train)
    output_type_metrics["test_rows"] = len(X_test)
    output_type_metrics["vocab_size"] = len(output_type_classifier.vocab)

    rprint("[bold]Output type classifier report:[/bold]")
    # NOTE(review): `show` is defined outside this section; presumably a
    # notebook display helper -- confirm.
    show(output_report_df)
    show(output_matrix_df)

    # Name each artifact path once so the file written and the path recorded
    # in model_artifacts cannot drift apart.
    output_type_metrics_path = OUT_DIR / "output_type_classifier_metrics.json"
    output_type_report_path = OUT_DIR / "output_type_classifier_report.csv"
    output_type_matrix_path = OUT_DIR / "output_type_confusion_matrix.csv"
    output_type_tokens_path = OUT_DIR / "output_type_top_tokens.csv"

    output_report_df.to_csv(output_type_report_path, index=False)
    output_matrix_df.to_csv(output_type_matrix_path)

    # One record per (label, token) pair returned for each class.
    top_token_records = [
        {
            "label": label,
            "token": token,
            "score_margin": margin,
        }
        for label in output_type_classifier.labels
        for token, margin in output_type_classifier.top_tokens_for_class(label, n=25)
    ]
    pd.DataFrame(top_token_records).to_csv(output_type_tokens_path, index=False)

    with open(output_type_metrics_path, "w", encoding="utf-8") as file:
        json.dump(output_type_metrics, file, ensure_ascii=False, indent=2)

    model_artifacts["output_type_classifier_metrics"] = str(output_type_metrics_path)
    model_artifacts["output_type_classifier_report"] = str(output_type_report_path)
    model_artifacts["output_type_confusion_matrix"] = str(output_type_matrix_path)
    model_artifacts["output_type_top_tokens"] = str(output_type_tokens_path)
else:
    rprint(
        "[yellow]Skipping output_type classifier because there are too few "
        "classes or rows.[/yellow]"
    )
    output_type_metrics = {}
# Baseline 2: train and evaluate a classifier that predicts the tool name
# from the `context` text of tool-use rows, then write its report artifacts.
rprint(Panel.fit("[bold]Baseline 2: Predict tool_name from context using pure Python Naive Bayes[/bold]"))

# Only rows that are tool_use events with a non-empty tool_name.
tool_classifier_df = df[
    df["output_type"].eq("tool_use")
    & df["tool_name"].fillna("").astype(str).str.len().gt(0)
].copy()

# Stays empty unless the classifier below is actually trained.
tool_metrics = {}

if len(tool_classifier_df) >= 50 and tool_classifier_df["tool_name"].nunique() >= 2:
    # Keep the 12 most frequent tools as labels; every other tool name is
    # folded into the single "__OTHER__" label.
    top_tools = tool_classifier_df["tool_name"].value_counts().head(12).index.tolist()
    tool_classifier_df["tool_label"] = tool_classifier_df["tool_name"].where(
        tool_classifier_df["tool_name"].isin(top_tools),
        "__OTHER__",
    )
    y_tool = tool_classifier_df["tool_label"].astype(str).tolist()
    X_tool_text = (
        tool_classifier_df["context"]
        .fillna("")
        .astype(str)
        .map(lambda text: text[:12000])  # cap every document at 12,000 characters
        .tolist()
    )

    # Re-check the label count on the final label column before splitting.
    if len(set(y_tool)) >= 2:
        train_indices, test_indices = stratified_train_test_indices(y_tool, test_size=0.2, seed=SEED)
        X_train = [X_tool_text[i] for i in train_indices]
        y_train = [y_tool[i] for i in train_indices]
        X_test = [X_tool_text[i] for i in test_indices]
        y_test = [y_tool[i] for i in test_indices]

        tool_classifier = PureMultinomialNB(
            max_features=20000,
            min_df=2,
            alpha=1.0,
        )
        # NOTE(review): `match` is kept exactly as this file spells the
        # training method of PureMultinomialNB; the class is defined outside
        # this section (conventionally `fit`) -- confirm against the definition.
        tool_classifier.match(X_train, y_train)
        tool_predictions = tool_classifier.predict(X_test)

        tool_metrics, tool_report_df = evaluate_predictions(y_test, tool_predictions)
        tool_matrix_df = confusion_matrix_df(y_test, tool_predictions)
        tool_metrics["train_rows"] = len(X_train)
        tool_metrics["test_rows"] = len(X_test)
        tool_metrics["vocab_size"] = len(tool_classifier.vocab)

        rprint("[bold]Tool classifier report:[/bold]")
        # NOTE(review): `show` is defined outside this section; presumably a
        # notebook display helper -- confirm.
        show(tool_report_df)
        show(tool_matrix_df)

        # Name each artifact path once so the file written and the path
        # recorded in model_artifacts cannot drift apart.
        tool_metrics_path = OUT_DIR / "tool_name_classifier_metrics.json"
        tool_report_path = OUT_DIR / "tool_name_classifier_report.csv"
        tool_matrix_path = OUT_DIR / "tool_name_confusion_matrix.csv"
        tool_tokens_path = OUT_DIR / "tool_name_top_tokens.csv"

        tool_report_df.to_csv(tool_report_path, index=False)
        tool_matrix_df.to_csv(tool_matrix_path)

        # One record per (label, token) pair returned for each class.
        top_tool_token_records = [
            {
                "label": label,
                "token": token,
                "score_margin": margin,
            }
            for label in tool_classifier.labels
            for token, margin in tool_classifier.top_tokens_for_class(label, n=25)
        ]
        pd.DataFrame(top_tool_token_records).to_csv(tool_tokens_path, index=False)

        with open(tool_metrics_path, "w", encoding="utf-8") as file:
            json.dump(tool_metrics, file, ensure_ascii=False, indent=2)

        model_artifacts["tool_name_classifier_metrics"] = str(tool_metrics_path)
        model_artifacts["tool_name_classifier_report"] = str(tool_report_path)
        model_artifacts["tool_name_confusion_matrix"] = str(tool_matrix_path)
        model_artifacts["tool_name_top_tokens"] = str(tool_tokens_path)
    else:
        rprint("[yellow]Skipping tool classifier because labels collapsed to 1 class.[/yellow]")
else:
    rprint(
        "[yellow]Skipping tool classifier because there are too few tool-use "
        "rows or tool classes.[/yellow]"
    )
rprint(Panel.fit("[bold]Building simple keyword search helper[/bold]"))


def search_rows(keyword, limit=5, search_cols=("context", "cot", "completion", "text_payload")):
    """Find rows of the module-level ``df`` whose text contains ``keyword``.

    The match is a case-insensitive literal substring test: the keyword is
    lower-cased and regex-escaped, and each searched column is lower-cased
    before comparison. A row is a hit when any searched column matches.

    Args:
        keyword: Value to search for; converted with ``str()`` first.
        limit: Maximum number of matching rows to return (first hits in
            ``df`` order).
        search_cols: Names of the ``df`` columns to search. Each one must
            exist in ``df``.

    Returns:
        A list of dicts, one per hit, with the keys ``uid``, ``session``,
        ``output_type``, ``tool_name``, ``context_preview`` and
        ``payload_preview``.
    """
    keyword = str(keyword).lower()
    pattern = re.escape(keyword)  # treat the keyword literally, not as a regex

    # Start from all-False and OR in the hits from each searched column.
    mask = pd.Series(False, index=df.index)
    for column in search_cols:
        mask = mask | (
            df[column]
            .fillna("")
            .astype(str)
            .str.lower()
            .str.contains(pattern, regex=True)
        )

    hits = df[mask].head(limit)
    # preview_text is defined elsewhere in the file; it is called here with
    # 400 -- presumably a maximum preview length, confirm.
    return [
        {
            "uid": row.get("uid"),
            "session": row.get("session"),
            "output_type": row.get("output_type"),
            "tool_name": row.get("tool_name"),
            "context_preview": preview_text(row.get("context"), 400),
            "payload_preview": preview_text(row.get("text_payload"), 400),
        }
        for _, row in hits.iterrows()
    ]


# Run a few sample searches and save the results as a demo artifact.
example_queries = [
    "Bash",
    "Write",
    "browser",
    "test",
    "README",
]
search_demo = {
    query: search_rows(query, limit=2)
    for query in example_queries
}
with open(
    OUT_DIR / "keyword_search_demo.json",
    "w",
    encoding="utf-8",
) as file:
    json.dump(search_demo, file, ensure_ascii=False, indent=2)

rprint("[bold]Example keyword search results:[/bold]")
rprint(safe_json_dumps(search_demo, max_chars=5000))
# Collect the run-level summary that is written to analysis_summary.json.
summary = {
    "dataset_id": DATASET_ID,
    "flat_jsonl_filename": FLAT_JSONL_FILENAME,
    "output_directory": str(OUT_DIR),
    "repo_file_summary": file_summary,
    "rows": int(len(df)),
    "columns": list(df.columns),
    # Null output types are counted under the explicit "missing" label.
    "output_type_distribution": (
        df["output_type"]
        .fillna("missing")
        .value_counts()
        .to_dict()
    ),
    # Top 20 tool names among tool_use rows; empty names count as "unknown".
    "top_tools": (
        df.loc[df["output_type"].eq("tool_use"), "tool_name"]
        .replace("", "unknown")
        .value_counts()
        .head(20)
        .to_dict()
    ),
    "top_source_roots": (
        df["source_root"]
        .fillna("unknown")
        .value_counts()
        .head(20)
        .to_dict()
    ),
    # Descriptive statistics for each length column.
    "length_summary": {
        column: {
            "mean": float(df[column].mean()),
            "median": float(df[column].median()),
            "p90": float(df[column].quantile(0.90)),
            "p95": float(df[column].quantile(0.95)),
            "max": int(df[column].max()),
        }
        for column in [
            "context_chars",
            "cot_chars",
            "completion_chars",
            "text_payload_chars",
        ]
    },
    "possible_secret_rows": int(df["possible_secret_anywhere"].sum()),
    "plots": plot_paths,
    "model_artifacts": model_artifacts,
    "safe_exports": {
        "train": str(OUT_DIR / "fable5_no_cot_chat_train.jsonl"),
        "validation": str(OUT_DIR / "fable5_no_cot_chat_validation.jsonl"),
        "test": str(OUT_DIR / "fable5_no_cot_chat_test.jsonl"),
    },
    "analysis_files": {
        "csv": str(OUT_DIR / "fable5_analysis_index.csv"),
        "pickle": str(OUT_DIR / "fable5_analysis_index.pkl"),
        "keyword_search_demo": str(OUT_DIR / "keyword_search_demo.json"),
    },
}

with open(
    OUT_DIR / "analysis_summary.json",
    "w",
    encoding="utf-8",
) as file:
    # default=str stringifies any value that json cannot serialize after
    # clean_for_json has processed the summary.
    json.dump(clean_for_json(summary), file, ensure_ascii=False, indent=2, default=str)

# Three backticks, built with chr(96) so the Markdown fence never appears
# literally in this source.
FENCE = chr(96) * 3

# Human-readable Markdown report assembled from the summary above.
report_md = (
    "# Fable 5 Traces Advanced Tutorial Report\n\n"
    "## Dataset\n\n"
    f"- Dataset: `{DATASET_ID}`\n"
    f"- Flat JSONL: `{FLAT_JSONL_FILENAME}`\n"
    f"- Rows loaded: `{len(df):,}`\n"
    f"- Unique source sessions: `{df['session'].nunique(dropna=True):,}`\n"
    f"- Unique models: `{df['model'].nunique(dropna=True):,}`\n\n"
    "## Important safety note\n\n"
    "This tutorial treats the dataset as agent telemetry. It previews and analyzes commands, "
    "tool calls, file edits, and transcript text, but it never executes commands found inside "
    "the traces.\n\n"
    f"Potential secret-like patterns detected: `{int(df['possible_secret_anywhere'].sum()):,}` rows.\n"
    "Exports redact common API-key/token-like patterns.\n\n"
    "## Output type distribution\n\n"
    f"{FENCE}json\n"
    f"{json.dumps(clean_for_json(summary['output_type_distribution']), indent=2, ensure_ascii=False)}\n"
    f"{FENCE}\n\n"
    "## Top tools\n\n"
    f"{FENCE}json\n"
    f"{json.dumps(clean_for_json(summary['top_tools']), indent=2, ensure_ascii=False)}\n"
    f"{FENCE}\n\n"
    "## Saved files\n\n"
    "- `analysis_summary.json`\n"
    "- `fable5_analysis_index.csv`\n"
    "- `fable5_analysis_index.pkl`\n"
    "- `fable5_no_cot_chat_train.jsonl`\n"
    "- `fable5_no_cot_chat_validation.jsonl`\n"
    "- `fable5_no_cot_chat_test.jsonl`\n"
    "- plot PNG files\n"
    "- baseline classifier metrics, when enough rows/classes are available\n\n"
    "## Recommended next steps\n\n"
    "1. Examine `fable5_no_cot_chat_train.jsonl` before any fine-tuning.\n"
    "2. Keep the dataset license in mind before model training or redistribution.\n"
    "3. Avoid training directly on raw terminal outputs without extra privacy and safety filtering.\n"
    "4. Begin with the no-CoT chat export unless your analysis explicitly requires reasoning-trace supervision.\n"
)

with open(
    OUT_DIR / "REPORT.md",
    "w",
    encoding="utf-8",
) as file:
    file.write(report_md)
# Final status panel listing the most important output files.
rprint(
    Panel.fit(
        "[bold green]Tutorial complete.[/bold green]\n\n"
        f"Artifacts saved in:\n{OUT_DIR}\n\n"
        "Key files:\n"
        f"- {OUT_DIR / 'REPORT.md'}\n"
        f"- {OUT_DIR / 'analysis_summary.json'}\n"
        f"- {OUT_DIR / 'fable5_no_cot_chat_train.jsonl'}\n"
        f"- {OUT_DIR / 'fable5_analysis_index.csv'}",
        title="Done",
    )
)

# (label, path) pairs kept together so the two table columns cannot get out
# of step with each other.
artifact_rows = [
    ("Report", OUT_DIR / "REPORT.md"),
    ("Summary JSON", OUT_DIR / "analysis_summary.json"),
    ("No-CoT train export", OUT_DIR / "fable5_no_cot_chat_train.jsonl"),
    ("No-CoT validation export", OUT_DIR / "fable5_no_cot_chat_validation.jsonl"),
    ("No-CoT test export", OUT_DIR / "fable5_no_cot_chat_test.jsonl"),
    ("Analysis CSV", OUT_DIR / "fable5_analysis_index.csv"),
    ("Analysis pickle", OUT_DIR / "fable5_analysis_index.pkl"),
    ("Keyword search demo", OUT_DIR / "keyword_search_demo.json"),
]

# NOTE(review): `show` is defined outside this section; presumably a
# notebook display helper -- confirm.
show(
    pd.DataFrame(
        {
            "artifact": [label for label, _ in artifact_rows],
            "path": [str(path) for _, path in artifact_rows],
        }
    )
)
Building a Safe Fable 5 Traces Workflow in Colab: Parsing Tool Calls, Auditing Data, and Training Baselines
RELATED ARTICLES
