Back to Article
graphs.ipynb
Download Notebook
In [2]:

import boto3
import pandas as pd
from io import StringIO

def load_all_csvs_from_s3(bucket_name):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    dfs = []

    for obj in bucket.objects.all():
        if obj.key.endswith('.csv') and obj.key.startswith('data'):
            data = obj.get()['Body'].read().decode('utf-8')
            df = pd.read_csv(StringIO(data))
            dfs.append(df)

    final_df = pd.concat(dfs, ignore_index=True)
    return final_df

df = load_all_csvs_from_s3('dipy-parallel-tests')

df.sort_values(by='cpu_count', inplace=True)
In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

shapes = df['data_shape'].unique()

for shape in shapes:
    shape_df = df[df['data_shape'] == shape]
    models = shape_df['model'].unique()

    for model in models:
        model_df = shape_df[shape_df['model'] == model]

        cpu_counts = model_df['cpu_count'].unique()

        fig, ax = plt.subplots(figsize=(10, 10))

        for cpu_count in cpu_counts:
            cpu_df = model_df[model_df['cpu_count'] == cpu_count]

            # Calculate the average time of the serial runs
            serial_avg_time = cpu_df[cpu_df['engine'] == 'serial']['time'].mean()

            # Group by 'num_chunks' and calculate the mean and std of 'time'
            grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()

            # Express the time of the ray runs as a percentage of the average serial time
            grouped_df['mean'] = grouped_df['mean'].apply(lambda x: (x / serial_avg_time) * 100 if serial_avg_time else 0)

            ax.errorbar(grouped_df['num_chunks'], grouped_df['mean'], yerr=grouped_df['std'], label=f'{cpu_count} CPUs', fmt='-o')

        ax.set_xscale('log')
        ax.set_ylim(bottom=0)

        ax.set_title(f'Model Type {model}, Shape {shape} (Average Time as Percentage of Serial Time with Error Bars)')
        ax.set_xlabel('num_chunks (log scale)')
        ax.set_ylabel('Average Time as Percentage of Serial Time')
        ax.legend()  # Add a legend
        plt.show()

for shape in shapes:
    shape_df = df[df['data_shape'] == shape]
    models = shape_df['model'].unique()

    for model in models:
        model_df = shape_df[shape_df['model'] == model]
        cpu_counts = model_df['cpu_count'].unique()

        fig, ax = plt.subplots(figsize=(10, 10))

        for cpu_count in cpu_counts:
            cpu_df = model_df[model_df['cpu_count'] == cpu_count]

            # Calculate the average time of the serial runs
            serial_avg_time = cpu_df[cpu_df['engine'] == 'serial']['time'].mean()

            # Group by 'num_chunks' and calculate the mean and std of 'time'
            grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()

            # Calculate the speedup as the ratio of the serial time to the ray time
            grouped_df['mean'] = grouped_df['mean'].apply(lambda x: serial_avg_time / x if x else 0)

            ax.errorbar(grouped_df['num_chunks'], grouped_df['mean'], yerr=grouped_df['std'], label=f'{cpu_count} CPUs', fmt='-o')

        ax.set_xscale('log')
        ax.set_ylim(bottom=0)

        ax.set_title(f'Model Type {model}, Shape {shape} (Speedup over Serial Time with Error Bars)')
        ax.set_xlabel('num_chunks (log scale)')
        ax.set_ylabel('Speedup over Serial Time')
        ax.legend()  # Add a legend
        plt.show()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/392492796.py:58: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean, np.std]).reset_index()

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

shapes = df['data_shape'].unique()

for shape in shapes:
    shape_df = df[df['data_shape'] == shape]
    models = shape_df['model'].unique()

    for model in models:
        model_df = shape_df[shape_df['model'] == model]
        cpu_counts = model_df['cpu_count'].unique()

        fig, ax = plt.subplots(figsize=(10, 10))

        for cpu_count in cpu_counts:

            cpu_df = model_df[model_df['cpu_count'] == cpu_count]

            serial_avg_time = cpu_df[cpu_df['engine'] == 'serial']['time'].mean()

            cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count

            grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()

            ax.errorbar(grouped_df['num_chunks'], grouped_df['mean'], yerr=grouped_df['std'], label=f'{cpu_count} CPUs', fmt='-o')

        ax.set_xscale('log')
        ax.set_ylim(bottom=0)

        ax.set_title(f'Model Type {model}, Shape {shape} (Efficency per Core with Error Bars)')
        ax.set_xlabel('num_chunks (log scale)')
        ax.set_ylabel('Efficency per Core')
        ax.legend()  # Add a legend
        plt.show()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cpu_df['efficency'] = (serial_avg_time / cpu_df['time']) / cpu_count
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function mean at 0x10636d260> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_99130/2647551254.py:25: FutureWarning: The provided callable <function std at 0x10636d3a0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
  grouped_df = cpu_df.groupby('num_chunks')['efficency'].agg([np.mean, np.std]).reset_index()

In [5]:
fig, ax = plt.subplots(figsize=(10, 10))

for cpu_count in cpu_counts:
    cpu_df = model_df[model_df['cpu_count'] == cpu_count]

    # Calculate the average time of the serial runs for each number of chunks
    serial_avg_time = cpu_df[cpu_df['engine'] == 'serial'].groupby('num_chunks')['time'].mean()

    # Calculate the average time for the current cpu_count for each number of chunks
    avg_time = cpu_df.groupby('num_chunks')['time'].mean()

    # Calculate efficiency per core for each number of chunks
    efficiency = (serial_avg_time / avg_time) / cpu_count

    # Plot efficiency per core
    ax.errorbar(efficiency.index, efficiency.values, label=f'{cpu_count} CPUs', fmt='-o')

ax.set_xlabel('Number of chunks')
ax.set_ylabel('Efficiency per core')
ax.legend()
plt.show()

In [6]:
# Calculate the efficiency per core as the ratio of the serial time to the ray time divided by the number of CPUs
grouped_df['mean'] = grouped_df.apply(lambda row: (serial_avg_time / row['mean']) / row['cpu_count'] if row['mean'] else 0, axis=1)
KeyError: 'cpu_count'
In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

shapes = df['data_shape'].unique()

for shape in shapes:
    shape_df = df[df['data_shape'] == shape]
    models = shape_df['model'].unique()

    for model in models:
        model_df = shape_df[shape_df['model'] == model]
        cpu_counts = model_df['cpu_count'].unique()

        fig, ax = plt.subplots(figsize=(10, 10))

        for cpu_count in cpu_counts:
            cpu_df = model_df[model_df['cpu_count'] == cpu_count]

            # Group by num_chunks and calculate the mean time
            grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()

            # Find the num_chunks with the minimum mean time
            fastest_num_chunks = grouped_df.loc[grouped_df['mean'].idxmin(), 'num_chunks']

            # Filter the dataframe to include only the rows with the fastest num_chunks
            fastest_df = cpu_df[cpu_df['num_chunks'] == fastest_num_chunks]

            # Calculate the mean time for the fastest num_chunks
            mean_time = fastest_df['time'].mean()

            ax.plot(cpu_count, mean_time, 'o')

        ax.set_title(f'Model Type {model}, Shape {shape} (Average Time for Fastest num_chunks)')
        ax.set_xlabel('cpu_count')
        ax.set_ylabel('Average Time')
        plt.show()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_20072/744342013.py:21: FutureWarning: The provided callable <function mean at 0x121c2cd60> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  grouped_df = cpu_df.groupby('num_chunks')['time'].agg([np.mean]).reset_index()