import boto3
import pandas as pd
from io import StringIO
def load_all_csvs_from_s3(bucket_name):
= boto3.resource('s3')
s3 = s3.Bucket(bucket_name)
bucket
= []
dfs
for obj in bucket.objects.all():
if obj.key.endswith('.csv') and obj.key.startswith('data'):
= obj.get()['Body'].read().decode('utf-8')
data = pd.read_csv(StringIO(data))
df
dfs.append(df)
= pd.concat(dfs, ignore_index=True)
final_df
return final_df
= load_all_csvs_from_s3('dipy-parallel-tests')
df
= pd.read_csv('machine_prices.csv') cost_df
In [2]:
In [3]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# quickly make a linear regression for cpus and cost
# we need this becasuse someof the tested numbers of,
# cpus dont have actualy machines available on aws
= cost_df.loc[:, ['cpus']]
x = cost_df.loc[:, ['cost']]
y
#linear regression
= PolynomialFeatures(degree=1)
poly_reg_1 = poly_reg_1.fit_transform(x)
x_poly_1 = LinearRegression()
lin_reg_1
lin_reg_1.fit(x_poly_1, y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [5]:
= pd.DataFrame(columns=['model','cpus','num_chunks','time','time_std'])
avg_df = pd.DataFrame(columns=['model','time','hr_cost','total_cost','cpus'])
final_df
# fill avg_df with average time for each model, cpus, and num_chunks
for model in df['model'].unique():
= df[df['model'] == model]
model_df for cpu in model_df['cpu_count'].unique():
= model_df[model_df['cpu_count'] == cpu]
cpu_df for num_chunks in cpu_df['num_chunks'].unique():
= cpu_df[cpu_df['num_chunks'] == num_chunks]
chunk_df
= chunk_df['time'].mean()
avg_time = chunk_df['time'].std()
time_std = pd.DataFrame({'model': [model], 'cpus': [cpu], 'num_chunks': [num_chunks], 'time': [avg_time], 'time_std': [time_std]})
new_row = pd.concat([avg_df, new_row], ignore_index=True)
avg_df
for model in avg_df['model'].unique():
= avg_df[avg_df['model'] == model]
model_df for cpu in model_df['cpus'].unique():
= model_df[model_df['cpus'] == cpu]
cpu_df = cpu_df['time'].min()
time = lin_reg_1.predict(poly_reg_1.fit_transform([[cpu]]))[0][0]
cost = cost * ((time/60)/60)
total_cost
= pd.DataFrame({'model': [model], 'time': [time], 'hr_cost': [cost], 'total_cost': [total_cost], 'cpus': [cpu]})
new_row = pd.concat([final_df, new_row], ignore_index=True)
final_df
import matplotlib.pyplot as plt
print(final_df)
# Create a plot for each unique model
for model in final_df['model'].unique():
= final_df[final_df['model'] == model]
model_df = model_df.sort_values(by='cpus')
model_df 'cpus'], model_df['total_cost'], 'o-', label=model)
plt.plot(model_df[
# Set the plot title and labels
'Predicted Total Cost vs CPUs')
plt.title('# CPUs')
plt.xlabel('Total Cost (USD)')
plt.ylabel(
# Add a legend
plt.legend()
# Show the plot
plt.show()
model time hr_cost total_cost cpus
0 ConstrainedSphericalDeconvModel 11.788321 1.375766 0.004505 32
1 ConstrainedSphericalDeconvModel 11.536030 2.061250 0.006605 48
2 ConstrainedSphericalDeconvModel 11.538867 3.089476 0.009903 72
3 ConstrainedSphericalDeconvModel 22.077673 0.347540 0.002131 8
4 ConstrainedSphericalDeconvModel 14.672244 0.690282 0.002813 16
5 FreeWaterTensorModel 44.687911 1.375766 0.017078 32
6 FreeWaterTensorModel 40.760232 2.061250 0.023338 48
7 FreeWaterTensorModel 31.475660 3.089476 0.027012 72
8 FreeWaterTensorModel 106.501855 0.347540 0.010282 8
9 FreeWaterTensorModel 60.897872 0.690282 0.011677 16
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_15448/4193019617.py:15: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
avg_df = pd.concat([avg_df, new_row], ignore_index=True)
/var/folders/gb/604kkgfn2cj2q1bp1zw3t56m0000gn/T/ipykernel_15448/4193019617.py:28: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
final_df = pd.concat([final_df, new_row], ignore_index=True)