OpenAI Evals
Evals provide a framework for evaluating large language models (LLMs) or systems built on top of LLMs. The framework offers an existing registry of evals to test different dimensions of OpenAI models, plus the ability to write custom evals for the use cases you care about. You can also use your own data to build private evals that represent the common LLM patterns in your workflow without exposing any of that data publicly.
If you are building with LLMs, creating high-quality evals is one of the most impactful things you can do. Without evals, it can be very difficult and time-intensive to understand how different model versions might affect your use case.
Installation
There are two ways to install 'evals':
- Cloning the 'evals' repository and installing from the source code
- Using pip install
In theory, option 2 should be sufficient to use the evals framework and run your evaluations, but I recommend option 1 for several reasons:
- The version in the Python Package Index is out of date; as of Feb 2024 it is not compatible with the latest OpenAI Python library. This might still be the case when you try.
- Having the source code makes it convenient to browse the code, check the examples, read the documents, and inspect the JSONL data files.
- 'evals' is open to contributions. You may want to contribute as a next step.
Option 1: Use the source code
The evals registry is stored using Git-LFS, so install LFS first.
brew install git-lfs
git lfs install
Go to the parent directory (assuming this notebook is in openai-cheat-sheet/) and clone the evals repository.
cd ..
git clone https://github.com/openai/evals.git
Fetch all the files (from within your local copy of the evals repo). This will populate all the pointer files under evals/registry/data.
cd ../evals
git lfs fetch --all
git lfs pull
Install evals from the source code along with its dependencies.
cd ../evals
pip install -e .
Option 2: Install from the Python Package Index
pip install evals
Scenario: Evaluating retrieval with different options
Let's assume that we have a chat completion that uses a CSV file for retrieval. The file has only two columns: text and embedding. For each user query, the completion function finds the top k embeddings in the CSV that are closest to the query, adds the corresponding 'text' of those embeddings to the system message to enrich the context, and the completion replies accordingly.
This is already implemented in the evals.completion_fns.retrieval:RetrievalCompletionFn class in evals, but in my tests it was not working correctly, so here we will create our own custom completion function and register it.
Our aim is to try out three different options with the same sample data set and compare their accuracy scores:
- Using gpt-4o-mini directly without feeding any extra context with retrieval.
- Using gpt-4o-mini with retrieval.
- Using gpt-4o-mini with retrieval and instructing it to use chain-of-thought logic.
For this, we will use a file with the birthdays of all presidents. Our completion function will retrieve the data from this file. Then we will test it with some sample user questions like "Was Franklin Pierce born before Abraham Lincoln? Answer Y or N."
Step 1: Set up the retrieval data
As the source for our retrieval data, we will use president_birthdays.csv:
- We generate the 'text' column from the data in the file.
- We get the 'embedding' for each row from its text column.
- We write the 'output/presidents_embeddings.csv' file with only the 'text' and 'embedding' columns. This file will be the input for the retrieval completion function.
import pandas as pd

input_datapath = "data/president_birthdays.csv"

# The CSV headers carry leading spaces and quote characters; normalize them.
df = (
    pd.read_csv(input_datapath)
    .rename(columns={' "Name"': "Name", ' "Month"': "Month", ' "Day"': "Day", ' "Year"': "Year"})
    .set_index("Index")
)
df["text"] = df.apply(lambda r: f"{r['Name']} was born on {r['Month']}/{r['Day']}/{r['Year']}", axis=1)
display(df.head())
| Index | Name | Day | Month | Year | text |
|---|---|---|---|---|---|
| 1 | "George Washington" | 22 | 2 | 1732.0 | "George Washington" was born on 2/22/1732.0 |
| 2 | "John Adams" | 30 | 10 | 1735.0 | "John Adams" was born on 10/30/1735.0 |
| 3 | "Thomas Jefferson" | 13 | 4 | 1743.0 | "Thomas Jefferson" was born on 4/13/1743.0 |
| 4 | "James Madison" | 16 | 3 | 1751.0 | "James Madison" was born on 3/16/1751.0 |
| 5 | "James Monroe" | 28 | 4 | 1758.0 | "James Monroe" was born on 4/28/1758.0 |
from openai import OpenAI

client = OpenAI()

def embed(text):
    # Request a single embedding vector for the given text.
    return client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    ).data[0].embedding

df["embedding"] = df["text"].apply(embed)
df[["text", "embedding"]].to_csv("output/presidents_embeddings.csv")
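One detail worth knowing: when the DataFrame is written to CSV, each embedding list is serialized as a plain string, so whatever reads the file back must parse that string into a list again. A minimal round-trip sketch with a toy 3-dimensional vector (real embeddings from text-embedding-3-small have 1536 dimensions):

```python
import ast
import io

import pandas as pd

# Toy stand-in for a real embedding vector.
df = pd.DataFrame({
    "text": ["George Washington was born on 2/22/1732"],
    "embedding": [[0.1, 0.2, 0.3]],
})

# Round-trip through CSV: the list is serialized to the string "[0.1, 0.2, 0.3]".
buf = io.StringIO()
df.to_csv(buf, index=False)
buf.seek(0)

loaded = pd.read_csv(buf)
assert isinstance(loaded["embedding"][0], str)  # comes back as a string
loaded["embedding"] = loaded["embedding"].apply(ast.literal_eval)  # parse back to a list
print(loaded["embedding"][0])
```

Any retrieval code consuming presidents_embeddings.csv has to perform an equivalent parsing step before computing similarities.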
Step 2: Implement your completion function
You can check the MLRetrievalCompletionFn class. It accepts three arguments:
- completion_fn: we can simply pass a model name, e.g. 'gpt-4-turbo', or pass another completion function. Completion functions can be chained, with one taking another as input.
- embeddings_and_text_path: a CSV file with 'text' and 'embedding' columns, used for retrieval.
- k: the top k closest embeddings are passed to the prompt. In our case k is 2, because the user always asks questions like "Was Andrew Jackson born before William Harrison?", so we only ever need two presidents' data.
If you want to read more about completion functions, you can read this document.
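The core retrieval step can be sketched independently of the evals machinery. Below is a rough illustration of "find the k stored texts closest to the query embedding" using cosine similarity; cosine_similarity and top_k_texts are hypothetical helpers written for this notebook, not part of evals or MLRetrievalCompletionFn:

```python
import numpy as np

def cosine_similarity(a, b):
    # Cosine of the angle between two vectors: dot product over the norms.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

def top_k_texts(rows, query_embedding, k=2):
    """Return the texts of the k rows whose embeddings are closest to the query.

    `rows` is a list of (text, embedding) pairs, as could be loaded from
    presidents_embeddings.csv.
    """
    scored = sorted(rows, key=lambda r: cosine_similarity(r[1], query_embedding), reverse=True)
    return [text for text, _ in scored[:k]]

# Toy 2-D embeddings standing in for real 1536-D ones.
rows = [
    ("Lincoln fact", [1.0, 0.0]),
    ("Pierce fact", [0.9, 0.1]),
    ("Monroe fact", [0.0, 1.0]),
]
print(top_k_texts(rows, [1.0, 0.05], k=2))  # → ['Lincoln fact', 'Pierce fact']
```

In the actual completion function, the retrieved texts are appended to the system message before the underlying completion function is called.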
Step 3: Register your completion function
To register our completion functions, we need to write a YAML file. You can check presidents.yaml.
Here we define three completion functions:
- cot/gpt-4o-mini: not used directly, but used as an input to the third one. It is based on the ChainOfThoughtCompletionFn class and adds chain-of-thought logic on top of gpt-4o-mini.
- retrieval/presidents/gpt-4o-mini: uses 'gpt-4o-mini' with retrieval.
- retrieval/presidents/cot/gpt-4o-mini: uses 'gpt-4o-mini' with chain-of-thought logic and retrieval.
# Open the file in read mode ('r')
with open('evals_registry/completion_fns/presidents.yaml', 'r') as file:
# Read the file's content
file_content = file.read()
# Print the content
print(file_content)
cot/gpt-4o-mini:
  class: evals.completion_fns.cot:ChainOfThoughtCompletionFn
  args:
    cot_completion_fn: gpt-4o-mini
retrieval/presidents/gpt-4o-mini:
  class: evals_registry.completion_fns.retrieval:MLRetrievalCompletionFn
  args:
    completion_fn: gpt-4o-mini
    embeddings_and_text_path: output/presidents_embeddings.csv
    k: 2
retrieval/presidents/cot/gpt-4o-mini:
  class: evals_registry.completion_fns.retrieval:MLRetrievalCompletionFn
  args:
    completion_fn: cot/gpt-4o-mini
    embeddings_and_text_path: output/presidents_embeddings.csv
Step 4: Build your eval
An eval is simply a dataset plus a choice of eval class. You have three options for building an eval:
- Use one of the basic eval classes. The most common ones are 'Match', 'Includes' and 'FuzzyMatch'. For a full list, you can check here.
- Use a model-graded eval class, i.e. use an LLM to evaluate the outputs of another LLM.
- Create your own custom eval class. In most cases this is not necessary, so it is not covered in this notebook; you can check here if you want to learn more.
Register the eval by adding a file at /evals/<eval_name>.yaml under the registry folder (in our case 'evals_registry'), using the elsuite registry format. For example, for a Match eval, it would be:
<eval_name>:
id: <eval_name>.dev.v0
description: <description>
metrics: [accuracy]
<eval_name>.dev.v0:
class: evals.elsuite.basic.match:Match
args:
samples_jsonl: <eval_name>/samples.jsonl
Upon running the eval, the data is looked up in the 'data' folder under the registry. For example, if older/samples.jsonl is the provided file, the data is expected to be at evals_registry/data/older/samples.jsonl.
The naming convention for evals is <eval_name>.<split>.<version>:
- eval_name is the eval name, used to group evals whose scores are comparable.
- split is the data split, used to further group evals under the same base eval. E.g., "val", "test", or "dev" for testing.
- version is the version of the eval, which can be any descriptive text you'd like to use (though it's best if it does not contain '.').
In general, running the same eval name against the same model should always give similar results, so that others can reproduce it. Therefore, whenever you change your eval, you should bump the version.
In our sample scenario, we will go with the first option and use 'Match'. You can check older.yaml.
# Open the file in read mode ('r')
with open('evals_registry/evals/older.yaml', 'r') as file:
# Read the file's content
file_content = file.read()
# Print the content
print(file_content)
older:
  id: older.dev.v0
  description: Test the model's ability to determine who is older.
  metrics: [accuracy]
older.dev.v0:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: older/samples.jsonl
Step 5: Set up your sample data for evaluation
You will need to convert your samples into the JSON Lines (JSONL) format: a file with one JSON object per line.
You can use the openai CLI (shipped with the OpenAI Python package) to transform data from some common file types into JSONL:
openai tools fine_tunes.prepare_data -f data[.csv, .json, .txt, .xlsx or .tsv]
You can find some examples of JSONL eval files here
Each JSON object will represent one data point in your eval. The keys you need in the JSON object depend on the eval template. All templates expect an "input" key, which is the prompt.
For the basic evals Match, Includes, and FuzzyMatch, the other required key is "ideal", which is a string (or a list of strings) specifying the correct reference answer(s).
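If your samples live in a spreadsheet or a Python list, a few lines of standard-library code are enough to produce a valid eval file. A minimal sketch (the filename samples_demo.jsonl and the question are illustrative):

```python
import json

# One data point per dict: a chat-style "input" plus the "ideal" reference answer.
samples = [
    {
        "input": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Was Abraham Lincoln born before Franklin Pierce? Answer Y or N."},
        ],
        "ideal": "N",
    },
]

# JSONL: one JSON object serialized per line.
with open("samples_demo.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

# Read the file back to confirm the format round-trips.
with open("samples_demo.jsonl") as f:
    first = json.loads(f.readline())
print(first["ideal"])  # → N
```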
In our sample scenario, we will use samples.jsonl. Check its content to get a feel for the format.
# Open the file in read mode ('r')
with open('evals_registry/data/older/samples.jsonl', 'r') as file:
# Loop over the first 3 lines and print each
for i, line in enumerate(file):
if i < 3:
print(line, end='') # Use end='' to avoid adding extra newlines
else:
break
{"input": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Was Abraham Lincoln born before Franklin Pierce? Answer Y or N."}], "ideal": "N"}
{"input": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Was Abraham Lincoln born before Andrew Johnson? Answer Y or N."}], "ideal": "N"}
{"input": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Was Andrew Jackson born before John Quincy Adams? Answer Y or N."}], "ideal": "Y"}
Step 6: Run the eval
After installing the evals tool, we can run an evaluation from the command line by specifying a completion function and an evaluation task:
- completion_fn: if you are using an OpenAI model directly, simply put its name. If you have your own custom completion function (as in presidents.yaml), put its name instead.
- eval_task: refers to a YAML file in the registry directory. The file defines the parameters of a specific evaluation task, e.g. the evaluation data, evaluation metrics, and prompting strategies.
oaieval <completion_fn> <eval_task>
Each run creates a JSONL log file under the tmp folder, such as '/tmp/evallogs/2402140749577WDOLKUD_gpt-4o-mini_older.jsonl'. You can check the corresponding file to see how the run went.
For more information on running evals, you can read this.
NOTE: The default registry path for evals is 'evals/registry'; here we use the custom path './evals_registry', so we need to pass the --registry_path argument to point at our registry folder.
NOTE: For evals to resolve the evals_registry.completion_fns.retrieval:MLRetrievalCompletionFn class, we need to add the path of the evals_registry folder to the PYTHONPATH.
# Use gpt-4o-mini directly without any retrieval or extra prompt -> Accuracy: 0.8
!export PYTHONPATH=".:$PYTHONPATH"; oaieval gpt-4o-mini older --max_samples 10 --registry_path ./evals_registry
[2024-08-10 19:53:33,489] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/evals
[2024-08-10 19:53:34,152] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/evals
[2024-08-10 19:53:34,152] [registry.py:272] Loading registry from evals_registry/evals
[2024-08-10 19:53:34,155] [oaieval.py:215] Run started: 2408101653344UTPQZFY
[2024-08-10 19:53:34,252] [data.py:94] Fetching evals_registry/data/older/samples.jsonl
[2024-08-10 19:53:34,254] [eval.py:36] Evaluating 10 samples
[2024-08-10 19:53:34,267] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 10/10 [00:01<00:00, 5.75it/s]
[2024-08-10 19:53:36,024] [oaieval.py:275] Found 10/10 sampling events with usage data
[2024-08-10 19:53:36,024] [oaieval.py:283] Token usage from 10 sampling events:
completion_tokens: 10
prompt_tokens: 316
total_tokens: 326
[2024-08-10 19:53:36,025] [record.py:371] Final report: {'accuracy': 0.8, 'boostrap_std': 0.1336369709324482, 'usage_completion_tokens': 10, 'usage_prompt_tokens': 316, 'usage_total_tokens': 326}. Logged to /tmp/evallogs/2408101653344UTPQZFY_gpt-4o-mini_older.jsonl
[2024-08-10 19:53:36,025] [oaieval.py:233] Final report:
[2024-08-10 19:53:36,025] [oaieval.py:235] accuracy: 0.8
[2024-08-10 19:53:36,025] [oaieval.py:235] boostrap_std: 0.1336369709324482
[2024-08-10 19:53:36,025] [oaieval.py:235] usage_completion_tokens: 10
[2024-08-10 19:53:36,025] [oaieval.py:235] usage_prompt_tokens: 316
[2024-08-10 19:53:36,025] [oaieval.py:235] usage_total_tokens: 326
[2024-08-10 19:53:36,031] [record.py:360] Logged 20 rows of events to /tmp/evallogs/2408101653344UTPQZFY_gpt-4o-mini_older.jsonl: insert_time=4.472ms
# Use gpt-4o-mini with retrieval -> Accuracy: 1.0
!export PYTHONPATH=".:$PYTHONPATH"; oaieval retrieval/presidents/gpt-4o-mini older --max_samples 10 --registry_path ./evals_registry
[2024-08-10 19:53:55,250] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/evals
[2024-08-10 19:53:55,798] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/evals
[2024-08-10 19:53:55,798] [registry.py:272] Loading registry from evals_registry/evals
[2024-08-10 19:53:56,312] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/completion_fns
[2024-08-10 19:53:56,320] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/completion_fns
[2024-08-10 19:53:56,320] [registry.py:272] Loading registry from evals_registry/completion_fns
[2024-08-10 19:53:56,322] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/solvers
[2024-08-10 19:53:56,500] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/solvers
[2024-08-10 19:53:56,500] [registry.py:272] Loading registry from evals_registry/solvers
[2024-08-10 19:53:56,823] [utils.py:161] NumExpr defaulting to 12 threads.
[2024-08-10 19:53:57,493] [oaieval.py:215] Run started: 240810165357I36QF4LQ
[2024-08-10 19:53:57,494] [data.py:94] Fetching evals_registry/data/older/samples.jsonl
[2024-08-10 19:53:57,495] [eval.py:36] Evaluating 10 samples
[2024-08-10 19:53:57,508] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 10/10 [00:01<00:00, 6.34it/s]
[2024-08-10 19:53:59,105] [oaieval.py:275] Found 10/20 sampling events with usage data
[2024-08-10 19:53:59,105] [oaieval.py:283] Token usage from 10 sampling events:
completion_tokens: 10
prompt_tokens: 828
total_tokens: 838
[2024-08-10 19:53:59,106] [record.py:371] Final report: {'accuracy': 1.0, 'boostrap_std': 0.0, 'usage_completion_tokens': 10, 'usage_prompt_tokens': 828, 'usage_total_tokens': 838}. Logged to /tmp/evallogs/240810165357I36QF4LQ_retrieval/presidents/gpt-4o-mini_older.jsonl
[2024-08-10 19:53:59,106] [oaieval.py:233] Final report:
[2024-08-10 19:53:59,106] [oaieval.py:235] accuracy: 1.0
[2024-08-10 19:53:59,106] [oaieval.py:235] boostrap_std: 0.0
[2024-08-10 19:53:59,106] [oaieval.py:235] usage_completion_tokens: 10
[2024-08-10 19:53:59,106] [oaieval.py:235] usage_prompt_tokens: 828
[2024-08-10 19:53:59,106] [oaieval.py:235] usage_total_tokens: 838
[2024-08-10 19:53:59,114] [record.py:360] Logged 30 rows of events to /tmp/evallogs/240810165357I36QF4LQ_retrieval/presidents/gpt-4o-mini_older.jsonl: insert_time=5.668ms
# Use gpt-4o-mini with retrieval and chain-of-thought -> Accuracy: 1.0
!export PYTHONPATH=".:$PYTHONPATH"; oaieval retrieval/presidents/cot/gpt-4o-mini older --max_samples 10 --registry_path ./evals_registry
[2024-08-10 19:54:18,209] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/evals
[2024-08-10 19:54:18,790] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/evals
[2024-08-10 19:54:18,790] [registry.py:272] Loading registry from evals_registry/evals
[2024-08-10 19:54:19,351] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/completion_fns
[2024-08-10 19:54:19,356] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/completion_fns
[2024-08-10 19:54:19,356] [registry.py:272] Loading registry from evals_registry/completion_fns
[2024-08-10 19:54:19,357] [registry.py:272] Loading registry from /Users/meltemseyhan/M9/MLTeam/mlteam-openai-training/evals/evals/registry/solvers
[2024-08-10 19:54:19,516] [registry.py:272] Loading registry from /Users/meltemseyhan/.evals/solvers
[2024-08-10 19:54:19,516] [registry.py:272] Loading registry from evals_registry/solvers
[2024-08-10 19:54:19,876] [utils.py:161] NumExpr defaulting to 12 threads.
[2024-08-10 19:54:20,496] [oaieval.py:215] Run started: 240810165420X2QELTYH
[2024-08-10 19:54:20,497] [data.py:94] Fetching evals_registry/data/older/samples.jsonl
[2024-08-10 19:54:20,498] [eval.py:36] Evaluating 10 samples
[2024-08-10 19:54:20,513] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 10/10 [00:04<00:00, 2.33it/s]
[2024-08-10 19:54:24,816] [oaieval.py:275] Found 20/50 sampling events with usage data
[2024-08-10 19:54:24,816] [oaieval.py:283] Token usage from 20 sampling events:
completion_tokens: 1,063
prompt_tokens: 4,303
total_tokens: 5,366
[2024-08-10 19:54:24,817] [record.py:371] Final report: {'accuracy': 1.0, 'boostrap_std': 0.0, 'usage_completion_tokens': 1063, 'usage_prompt_tokens': 4303, 'usage_total_tokens': 5366}. Logged to /tmp/evallogs/240810165420X2QELTYH_retrieval/presidents/cot/gpt-4o-mini_older.jsonl
[2024-08-10 19:54:24,817] [oaieval.py:233] Final report:
[2024-08-10 19:54:24,817] [oaieval.py:235] accuracy: 1.0
[2024-08-10 19:54:24,817] [oaieval.py:235] boostrap_std: 0.0
[2024-08-10 19:54:24,817] [oaieval.py:235] usage_completion_tokens: 1063
[2024-08-10 19:54:24,817] [oaieval.py:235] usage_prompt_tokens: 4303
[2024-08-10 19:54:24,817] [oaieval.py:235] usage_total_tokens: 5366
[2024-08-10 19:54:24,830] [record.py:360] Logged 60 rows of events to /tmp/evallogs/240810165420X2QELTYH_retrieval/presidents/cot/gpt-4o-mini_older.jsonl: insert_time=10.769ms