I am learning data analysis and love running, so I put the two together. Please feel free to comment and make suggestions.
All variables use the metric system.
If you use the imperial system, you can add a few lines of code to convert the values, for example with the `units` package from PyPI: https://pypi.python.org/pypi/units/
Since my data was collected over a couple of years, some variables take that into account. It is still possible to use this notebook if you have only recently started tracking your activities: just adjust the time spans (change years to months or weeks, whatever your case may be) and remember that the results might not represent the ground truth, because there is not enough data yet.
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime as dt
%matplotlib inline
#Load data
# NOTE: this path is specific to the author's machine; point it at your own
# Runkeeper export before running.
data = '/home/camila/Desktop/runkeeper-data-export-26662480-2017-08-15-1401/cardioActivities.csv'
df_runkeeper = pd.read_csv(data)
print(df_runkeeper.head())
#Check if any data is missing
print(df_runkeeper.isnull().sum())
#Activities types
print(df_runkeeper['Type'].value_counts().reset_index())
#Split the table, only running:
running = df_runkeeper.loc[df_runkeeper['Type'] == 'Running']
#Drop not useful columns:
# The axis is passed by keyword: pandas 2.x no longer accepts it positionally.
running = running.drop(['Route Name', 'Notes'], axis=1)
#Parse date:
def parse_date(date):
    """Convert one ``Date`` cell into a ``datetime``.

    Args:
        date: Timestamp text in ``%Y-%m-%d %H:%M:%S`` form, or a missing
            value (``None``, NaN/NaT, or an empty/blank string).

    Returns:
        The parsed ``datetime``, or ``None`` when the value is missing.

    Raises:
        ValueError: If text is present but does not match the format.
    """
    # Missing values must be caught before str(): str(NaN) is "nan" and
    # str(None) is "None", both of which would slip past the empty-string
    # check below and make strptime raise.
    if date is None or pd.isna(date):
        return None
    date = str(date).strip()
    if date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d %H:%M:%S')
# Add a parsed_date column: every "Date" cell run through parse_date.
parsed_dates = running["Date"].apply(parse_date)
running = running.assign(parsed_date=parsed_dates)
#Parse duration:
def parse_duration(duration):
    """Convert one ``Duration`` cell into a ``datetime``.

    The result is a ``datetime`` rather than a ``timedelta``: ``strptime``
    fills in its default date, so only the time-of-day part carries the
    duration.

    Args:
        duration: Text in ``MM:SS`` or ``H:MM:SS`` form, or a missing value
            (``None``, NaN/NaT, or an empty/blank string).

    Returns:
        The parsed ``datetime``, or ``None`` when the value is missing.

    Raises:
        ValueError: If text is present but matches neither layout.
    """
    # Missing values must be caught before str(): str(NaN) is "nan", which
    # is not empty and would make strptime raise.
    if duration is None or pd.isna(duration):
        return None
    duration = str(duration).strip()
    if duration == '':
        return None
    # Choose the layout from the number of fields, not the text length, so
    # short hour-bearing values such as "1:2:3" are parsed correctly.
    if duration.count(':') == 1:
        return dt.strptime(duration, '%M:%S')
    return dt.strptime(duration, '%H:%M:%S')
# Add a parsed_duration column: every "Duration" cell run through parse_duration.
parsed_durations = running["Duration"].apply(parse_duration)
running = running.assign(parsed_duration=parsed_durations)
#Parse pace
def convert2pace(speed):
    """Convert a speed in km/h into a pace in minutes per kilometre.

    Args:
        speed: Average speed in km/h.

    Returns:
        ``60 / speed``. A plain-Python zero speed yields NaN instead of
        raising, so a single stationary activity cannot abort the whole
        column conversion.
    """
    try:
        return 60 / speed
    except ZeroDivisionError:
        # No distance was covered, so there is no meaningful pace.
        return float('nan')
# Derive each run's pace from its average speed.
running["Average Pace"] = running["Average Speed (km/h)"].apply(convert2pace)
#Drop speed outliers
# Keep only the runs whose average speed lies within three standard
# deviations of the mean speed.
speed_kmh = running['Average Speed (km/h)']
speed_deviation = np.abs(speed_kmh - speed_kmh.mean())
running = running[speed_deviation <= (3 * speed_kmh.std())]
#DF describe
running.describe()
# Create variables used in the "report". All variables use the *metric system*.
# If you use the imperial system, you can add some lines of code and convert it using PyPI:
# https://pypi.python.org/pypi/units/

# Time span covered by the data. The Series methods are used instead of the
# min()/max() builtins because they skip missing (NaT) entries.
oldest = running["parsed_date"].min()
most_recent = running["parsed_date"].max()
max_duration = running["parsed_duration"].max()
delta_time = most_recent - oldest
days_over_time = delta_time / np.timedelta64(1, 'D')
# Split the span into whole years plus whole leftover months. The leftover
# days are divided by the average month length (365/12) so the result stays
# in 0..11; dividing by 30 could report "12 months".
years_over_time, leftover_days = divmod(int(days_over_time), 365)
months = int(leftover_days / (365 / 12.0))

# Distance statistics (km)
total_distance = running["Distance (km)"].sum()
mean_distance = running["Distance (km)"].mean()
max_distance = running["Distance (km)"].max()
min_distance = running["Distance (km)"].min()

# Per-activity averages are taken over every activity that has a Date.
total_activities = running["Date"].count()
total_calories = running["Calories Burned"].sum()
avg_calories = total_calories/total_activities
total_climb = running["Climb (m)"].sum()
avg_climb = total_climb/total_activities

avg_pace = running["Average Pace"].mean()
avg_speed = running["Average Speed (km/h)"].mean()
# mean() averages only the activities that recorded a heart rate, which is
# what the previous sum()/count() pair computed by hand.
avg_heart_rate = running["Average Heart Rate (bpm)"].mean()
# Create strings with the variables to create the "report".
# Each sentence is one template filled by a single format() call. "{!s}"
# reproduces the plain str() conversion; "{:.1f}" rounds to one decimal.
str_total_run1 = (
    "In total I ran {!s} times over approx. {!s} years and {!s} months "
    "({!s} days) covering a total distance of {:.1f}km."
).format(total_activities, years_over_time, months, int(days_over_time), total_distance)
str_total_run2 = (
    "\nI usually run {:.1f}km each activity."
    "\nThe max. distance I ran was {!s}km and the min distance was {!s}km. "
    "The max. duration I ran was {:%H:%M:%S} (Hour:Min:Sec)."
).format(mean_distance, max_distance, min_distance, max_duration)
str_total_run3 = (
    "\nDuring all this running I burnt a total of {:.1f} calories. "
    "Average of {:.1f} calories per activity."
).format(total_calories, avg_calories)
str_total_run4 = (
    "\nMy average pace is {:.1f} min/km and therefore my average speed is {:.1f} km/h."
).format(avg_pace, avg_speed)
str_total_run5 = (
    "\nTotal climb was {!s} meters and the average climb per activity was {:.1f} meters."
).format(total_climb, avg_climb)
str_total_run6 = "\nLast but not least, average heart rate was {:.1f} bpm.".format(avg_heart_rate)
overview_report = "".join([
    str_total_run1,
    str_total_run2,
    str_total_run3,
    str_total_run4,
    str_total_run5,
    str_total_run6,
])
print(overview_report)
#Compare average speed for the categories 5k, 10k and 21k:
# Each category is the set of runs whose distance falls inside a band
# around the nominal race distance.
around5k = running[(running["Distance (km)"] >= 4.5) & (running["Distance (km)"] <= 5.5)]
around10k = running[(running["Distance (km)"] >= 9.5) & (running["Distance (km)"] <= 10.5)]
around21k = running[(running["Distance (km)"] >= 20) & (running["Distance (km)"] <= 21.5)]
# One speed sample per category, in the order the boxes are drawn.
data = (around5k['Average Speed (km/h)'], around10k['Average Speed (km/h)'], around21k['Average Speed (km/h)'])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.boxplot(data)
ax.set_xlabel('Distances')
ax.set_ylabel('Avg Speed')
ax.set_title('Avg Speed vs distance categories')
# set_xticks must be called, not assigned to: assignment would overwrite the
# method on this Axes. boxplot places its boxes at positions 1..N by default.
ax.set_xticks([1, 2, 3])
ax.set_xticklabels(["5K", "10K", "21K"])
plt.show()
It certainly looks like I can improve my 10k pace.
#Compare number of activities per category
# Count the runs that have a parsed date in each distance category.
activities5k, activities10k, activities21k = (
    category["parsed_date"].count()
    for category in (around5k, around10k, around21k)
)
pie_labels = ["5K", "10K", "21K"]
data_activities = [activities5k, activities10k, activities21k]
# Pull the first slice (5K) slightly out of the pie.
explode = (0.05, 0, 0)
plt.pie(data_activities, labels=pie_labels, explode=explode)
plt.title('Frequency of distances')
plt.show()
# Distance, speed and pace histograms, drawn one after another.
# Each entry is (column, number of bins, plot title).
histogram_specs = (
    ("Distance (km)", 10, "Distance"),          #Distance histogram
    ("Average Speed (km/h)", 12, "Speed"),      #Speed histogram
    ("Average Pace", 16, "Pace"),               #Pace histogram (I prefer to think in terms of pace, rather than speed)
)
for column, n_bins, title in histogram_specs:
    hist, bins = np.histogram(running[column], bins=n_bins)
    # Bars are centred on each bin and take 80% of the bin width.
    center = (bins[:-1] + bins[1:]) / 2
    width = 0.8 * (bins[1] - bins[0])
    plt.bar(center, hist, align='center', width=width)
    plt.title(title)
    plt.show()
# Creates function to create labels:
def stringer(series):
    """Return the ``str()`` form of every element of *series* as a list.

    Args:
        series: Any iterable of values.

    Returns:
        A new list with one string per element, in iteration order.
    """
    # A comprehension avoids the manual append loop and the local name
    # ``list``, which shadowed the builtin.
    return [str(elem) for elem in series]
# Speed progression - distance ~5k
# Gather the series first: run dates on the x axis, average speed on the y axis.
dates = around5k["parsed_date"]
dates_labels = dates
my_labels = stringer(around5k["parsed_date"])
y = around5k["Average Speed (km/h)"]
# Dates are converted to matplotlib date numbers so a line can be fitted.
x = mdates.date2num(dates.astype(dt))
# Degree-1 polynomial fit: slope m and intercept b of the trend line.
m, b = np.polyfit(x, y, 1)

fig, ax = plt.subplots()
ax.plot(x, y)
ax.plot(x, m*x + b)
ax.set(
    title="Speed progression - distance ~5k",
    ylabel='Speed',
    xlabel='Dates',
)
#set x - ticks
# Let matplotlib choose date tick positions and a matching label format.
xtick_locator = mdates.AutoDateLocator()
xtick_formatter = mdates.AutoDateFormatter(xtick_locator)
ax.xaxis.set_major_locator(xtick_locator)
ax.xaxis.set_major_formatter(xtick_formatter)
fig.autofmt_xdate()
plt.show()
#Predicting speed goal achievement
# Target average speed (km/h) for the ~5k runs; change it to your own goal.
goal_speed = 11
# Refit the linear trend speed = m*date + b straight from around5k, so this
# cell does not rely on variables left over from the plotting cell.
y = around5k["Average Speed (km/h)"]
x = mdates.date2num(around5k["parsed_date"].astype(dt))
m, b = np.polyfit(x, y, 1)
if m == 0:
    # A perfectly flat trend never crosses the goal, and dividing by the
    # slope would fail.
    print("Speed trend is flat; no goal date can be projected.")
else:
    # Solve goal_speed = m*x + b for the date number x, then print it as a date.
    x = (goal_speed - b)/m
    print(mdates.num2date(x))
I certainly can improve my 10k pace. And though I know progress is not a linear function, I am really optimistic about the prediction.
I hope you enjoyed using this notebook as much as I did creating it!
Keep running!