
BJJ Submission Finder
Let's import the necessary modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import wikipedia
import re
Loading the datasets
p_stats=pd.read_csv("physical_stats_data.csv",encoding='unicode_escape')
ADCC=pd.read_csv("adcc_historical_data.csv",delimiter=";")
print(ADCC.columns)
Selecting only wins by submission and merging data sets
ADCC.dropna(subset=["submission",'weight_class', 'sex'], inplace=True)
ADCC_win=ADCC[['winner_name','submission','weight_class', 'sex']]
ADCC_sub=ADCC_win.groupby("winner_name",as_index=False).agg(lambda x: pd.Series.mode(x)[0])
ADCC_sub.rename(columns={"winner_name":"Name"},inplace=True)
ADCC_final=pd.merge(ADCC_sub,p_stats, on=["Name"], how="left")
missing_vals=ADCC_final[ADCC_final["Height"].isna()]["Name"]
print(missing_vals.head())
Web scraping for more athlete stats
name="gordon ryan" # Testing with Gordon rayan
def get_p_stats(name):
    # Search Wikipedia for the athlete's page and parse the infobox for height and weight.
    word = name + " grappler"
    wiki_search = wikipedia.search(word)
    height = float("nan")
    weight = float("nan")
    if not wiki_search:
        return height, weight
    try:
        wiki_page = wikipedia.page(wiki_search[0])
    except Exception:
        return height, weight
    url = wiki_page.url
    url_open = requests.get(url)
    soup = BeautifulSoup(url_open.content, "html.parser")
    details = soup('table', {'class': 'infobox'})
    p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'  # matches integers and decimals
    for i in details:
        h = i.find_all('tr')
        for j in h:
            heading = j.find_all('th')
            detail = j.find_all('td')
            if heading is not None and detail is not None:
                for x, y in zip(heading, detail):
                    # Getting the height
                    if x.text == "Height":
                        nums = []
                        idx_m = y.text.find("m")
                        if idx_m != -1:
                            if re.search(p, y.text) is not None:
                                for catch in re.finditer(p, y.text):
                                    nums.append(catch[0])
                            if len(nums) > 1:
                                idx_imp = y.text.find("ft")
                                if idx_imp < idx_m:
                                    # Imperial value comes first, so the metric value is the last match
                                    height = float(nums[2])
                                    if height < 10:
                                        height = height * 100  # convert metres to cm
                                else:
                                    height = float(nums[0])
                                    if height < 10:
                                        height = height * 100  # convert metres to cm
                    # Getting the weight
                    if x.text == "Weight":
                        # print(y.text)
                        nums = []
                        idx_m = y.text.find("kg")
                        if idx_m != -1:
                            if re.search(p, y.text) is not None:
                                for catch in re.finditer(p, y.text):
                                    nums.append(catch[0])
                            if len(nums) > 1:
                                idx_imp = y.text.find("lb")
                                if idx_imp < idx_m:
                                    weight = float(nums[1])
                                else:
                                    weight = float(nums[0])
    return height, weight
res=get_p_stats(name)
print(res)
Looking for the missing stats
height = []
weight = []
missing_athletes = []
name_list = missing_vals.tolist()
# name_list=["Abraham Marte Messina"]
for name in name_list:
    # print(name)
    try:
        h, w = get_p_stats(name)
        height.append(h)
        weight.append(w)
    except Exception:
        height.append(float("nan"))
        weight.append(float("nan"))
        missing_athletes.append(name)
print(missing_athletes)
Creating a result dataframe and saving it
df=pd.DataFrame({"name":name_list, "height":height,"weight":weight})
df.to_csv("Data/wiki_results") #saving the results
print(df.isna().sum())
Let's check the stats
df_missing=df[df["height"].isna()]
print(df.describe())
Wikipedia didn't get us far; let's try Google search instead
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
from dotenv import load_dotenv
_ = load_dotenv()
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import random
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),options = chrome_options)
# Getting rid of the cookies pop ups
# name='gordon ryan'
name='Craig Jones'
query=name.replace(" ","+")+"+bjj"+"+height"
base_url="https://www.google.com/search?q="
url=base_url+query
data = driver.get(url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//button[@id="L2AGLb"]'))).click()
def extract_digit(txt):
    # Pull the first numeric token (integer or decimal) out of a string.
    p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'
    num = float(re.findall(p, txt)[0])
    return num

def Extract_Result_From_Answer(driver, key):
    # Read Google's answer-box div, identified by its class name.
    res = driver.find_element(By.XPATH, '//div[@class="' + key + '"]').text
    res = extract_digit(res)
    return res

def extract_Result_From_Table(driver, unit):
    # Fall back to the web-answers table and grab the first cell mentioning the unit.
    res = driver.find_element(By.XPATH, '//div[@class="webanswers-webanswers_table__webanswers-table"]').find_element(By.TAG_NAME, 'table').find_element(By.XPATH, '//*[contains(text(),"' + unit + '")]').text
    res = extract_digit(res)
    return res
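As a quick sanity check of the regex helper (hypothetical input string, not a scraped value):
print(extract_digit("1.88 m (6 ft 2 in)"))  # -> 1.88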
def extract_height(driver, name):
    base_url = "https://www.google.com/search?q="
    # suffixes=['bjj','grappler','ADCC','IBJJF','Brazilian Jiu-Jitsu','Brazilian Jiu Jitsu','Jiu-Jitsu']
    suffixes = ['BJJ']
    keys = ['Z0LcW t2b5Cf CfV8xf', 'Z0LcW t2b5Cf']
    for key in keys:
        for i in suffixes:
            query = name.replace(" ", "+") + '+' + i + "+height"
            url = base_url + query
            driver.get(url)
            try:
                h = Extract_Result_From_Answer(driver, key)
                if h < 10:
                    h = h * 100  # convert metres to cm
                return h
            except Exception:
                pass
            try:
                time.sleep(random.randint(1, 5))
                h = extract_Result_From_Table(driver, ' cm')
                if h < 100:
                    h = h * 100
                return h
            except Exception:
                pass
            try:
                time.sleep(random.randint(1, 5))
                h = extract_Result_From_Table(driver, ' m')
                if h < 10:
                    h = h * 100  # convert metres to cm
                return h
            except Exception:
                pass
    return float("nan")
def extract_weight(driver, name):
    base_url = "https://www.google.com/search?q="
    # suffixes=['bjj','grappler','ADCC','IBJJF']
    key = 'Z0LcW t2b5Cf'
    suffixes = ['BJJ']
    for i in suffixes:
        query = name.replace(" ", "+") + '+' + i + "+weight"
        url = base_url + query
        driver.get(url)
        try:
            w = Extract_Result_From_Answer(driver, key)
            return w
        except Exception:
            pass
        try:
            time.sleep(random.randint(1, 5))
            w = extract_Result_From_Table(driver, ' kg')
            return w
        except Exception:
            pass
    return float("nan")
extract_weight(driver,name)
from tqdm import tqdm
import random
height=[]
weight=[]
missing_athletes_google=[]
missing_athletes=df[df["height"].isna()]['name'].tolist()
for name in tqdm(missing_athletes):
    h = extract_height(driver, name)
    time.sleep(random.randint(1, 10))
    w = extract_weight(driver, name)
    height.append(h)
    weight.append(w)
    time.sleep(random.randint(1, 10))
    # print(f"{name} Height={h} Weight={w}")
    if np.isnan(h) or np.isnan(w):
        missing_athletes_google.append(name)
df_google=pd.DataFrame({'name':missing_athletes, 'height':height,'weight':weight })
df_google.to_csv("Data/google_results")
missing_athletes_google=df_google[df_google["height"].isna()]['name'].tolist()
It looks like the heights and weights are going to be hard to find. Let's move on and fill the missing values using the weight class: the mean height and weight of competitors in the same class.
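As a quick illustration of that fill pattern (toy numbers, not from the dataset), a groupby-transform of the mean replaces a missing value with its group's average:
toy = pd.DataFrame({'weight_class': ['66kg', '66kg', '66kg', '88kg'],
                    'height': [165.0, np.nan, 171.0, np.nan]})
toy['height'] = toy['height'].fillna(toy.groupby('weight_class')['height'].transform('mean'))
print(toy)  # the 66kg NaN becomes 168.0; the 88kg NaN stays NaN (no data in that group)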
# Let's merge the results
df_google=pd.read_csv("Data/google_results")
df_wiki=pd.read_csv("Data/wiki_results")
Data_set_final=pd.concat([df_google, df_wiki])
Data_set_final.drop(columns=['Unnamed: 0'],inplace=True)
Data_set_final.rename(columns={"name":"Name"},inplace=True)
# Adding weight category to fill in the missing weights
Data_set_final=Data_set_final.merge(ADCC_final[['Name','submission','weight_class','sex']], on=["Name"], how="left")
Let's clean up the data
fillers=Data_set_final[['height', 'weight','sex','weight_class']].groupby(['sex','weight_class']).mean()
print(fillers)
# It looks like we don't have the average weight for female grapplers in the -60 kg class; let's just replace it with 60
fillers.fillna(60,inplace=True)
# Let's fill in the missing values with the group means
Data_set_final['height'].fillna(Data_set_final[['height','sex','weight_class']].groupby(['sex','weight_class'])['height'].transform('mean'),inplace=True)
Data_set_final['weight'].fillna(Data_set_final[['weight','sex','weight_class']].groupby(['sex','weight_class'])['weight'].transform('mean'),inplace=True)
print(Data_set_final[Data_set_final['weight'].isna()][['sex','weight_class']])
# All remaining NaNs are in the female -60 kg class; let's fill them with 60 kg
Data_set_final['weight'].fillna(60,inplace=True)
Data_set_final.to_csv('Data/Data_set_final')
Data_set_final=pd.read_csv('Data/Data_set_final',index_col=[0])
Data_set_final['gender']=(Data_set_final['sex']=='M')
Data_set_clean=Data_set_final
count=dict(Data_set_clean['submission'].value_counts())
Data_set_clean['sub_count']=Data_set_clean['submission'].map(count)
print(Data_set_clean.head())
Data_set_clean.drop(Data_set_clean[Data_set_clean['sub_count']<10].index,inplace=True)
Data_set_clean.drop(Data_set_clean[Data_set_clean['submission']=='Submission'].index,inplace=True)
Data_set_clean.drop(columns=['sub_count'],inplace=True)
Data_set_clean.to_csv('Data/data_set_training')
# Let's get training
from sklearn.model_selection import ShuffleSplit, train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
Data_set_clean=pd.read_csv('Data/data_set_training',index_col=[0])
features=['height','weight','gender']
X=Data_set_clean[features].to_numpy()
y=Data_set_clean['submission']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sns.pairplot(Data_set_clean,hue='submission',vars=features,kind='scatter')
Training a KNN classifier
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
k_values = list(range(1, 32))
scores = []
ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=1)
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
score = cross_val_score(knn, X_train_s, y_train, cv=ss)
scores.append(np.mean(score))
sns.lineplot(x = k_values, y = scores, marker = 'o')
plt.xlabel("K Values")
plt.ylabel("Accuracy Score")
best_index = np.argmax(scores)
best_k = k_values[best_index]
knn = KNeighborsClassifier(n_neighbors=best_k)
# Fit on the training split only so the held-out test set stays unseen
knn.fit(X_train_s, y_train)
y_pred = knn.predict(X_test_s)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted',zero_division=0.0)
recall = recall_score(y_test, y_pred,average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
Training a decision tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
clf = DecisionTreeClassifier(criterion='entropy')
score = cross_val_score(clf, X_train, y_train, cv=ss)
print("Cross-validation accuracy:", np.mean(score))
# Fit on the training split only so the held-out test set stays unseen
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted',zero_division=0.0)
recall = recall_score(y_test, y_pred,average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
Plotting the tree
# class_names must follow the order of clf.classes_
plot_tree(clf,feature_names=features,class_names=clf.classes_.tolist(),filled=True)
# plt.savefig('imgs/out.pdf')
# saving the models
import pickle
knnPickle = open('models/knnpickle_file', 'wb')
pickle.dump(knn, knnPickle)
knnPickle.close()
treePickle=open('models/treepickle_file', 'wb')
pickle.dump(clf, treePickle)
treePickle.close()
scalerPickle=open('models/scalerpickle_file','wb')
pickle.dump(scaler, scalerPickle)
scalerPickle.close()
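As a quick sanity check (hypothetical athlete measurements), the saved scaler and KNN model can be reloaded and queried:
with open('models/scalerpickle_file', 'rb') as f:
    loaded_scaler = pickle.load(f)
with open('models/knnpickle_file', 'rb') as f:
    loaded_knn = pickle.load(f)
# Hypothetical athlete: 185 cm, 90 kg, male (gender encoded as 1)
sample = loaded_scaler.transform([[185, 90, 1]])
print(loaded_knn.predict(sample))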
Building the interface. To run the app, use the command: streamlit run app
import numpy as np
import pandas as pd
import streamlit as st
import pickle
from streamlit.connections import SQLConnection
from sqlalchemy.sql import text
from datetime import timedelta
def main():
    # Load the trained decision tree
    clf = pickle.load(open('models/treepickle_file', 'rb'))
    # Initialize the database connection.
    conn = st.connection("postgresql", type="sql")
    st.title("Submission Finder")
    html_temp = """
    <div>
    <h2 style="color:white;text-align:center;">Submission Finder</h2>
    </div>
    """
    st.markdown(html_temp, unsafe_allow_html=True)
    gender = st.selectbox("Gender", ["Female", "Male"])
    weight = st.text_input("Weight", "95")
    height = st.text_input("Height", "179")
    fav_sub = st.text_input("High percentage submission")
    if st.button("Predict"):
        data = {'height': int(height), 'weight': int(weight), 'gender': gender}
        df = pd.DataFrame([list(data.values())], columns=['height', 'weight', 'gender'])
        df['gender_num'] = df['gender'] == 'Male'
        features = ['height', 'weight', 'gender_num']
        X = df[features].to_numpy()
        # X=scaler.transform(X)
        y = clf.predict(X)
        # Store the user input and the prediction in the database
        with conn.session as s:
            s.execute(text('INSERT INTO physical_stats (height, weight, gender, submission, predsub) VALUES (:height, :weight, :gender, :submission, :predsub);'),
                      params=dict(height=height, weight=weight, gender=gender, submission=fav_sub, predsub=y[0]))
            s.commit()
        st.success('Submission is {}'.format(y[0]))

if __name__ == '__main__':
    main()
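The INSERT above assumes a physical_stats table already exists in the Postgres database. Here is a minimal sketch of creating it with SQLAlchemy; the connection string is a placeholder and the column types are assumptions, not taken from the project:
from sqlalchemy import create_engine, text

# Hypothetical connection string; point it at your own Postgres instance (e.g. AWS RDS).
engine = create_engine("postgresql+psycopg2://user:password@host:5432/dbname")

# Assumed column types chosen to match the INSERT statement in the app above.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS physical_stats (
            id SERIAL PRIMARY KEY,
            height NUMERIC,
            weight NUMERIC,
            gender TEXT,
            submission TEXT,
            predsub TEXT
        );
    """))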
I leveraged my data science expertise to create a web application that predicts optimal Brazilian Jiu-Jitsu (BJJ) submissions based on body type. To achieve this, I first collected data on successful submissions from the prestigious ADCC tournaments and gathered athlete statistics through web scraping. I then trained a decision tree model on this data; it analyzes an individual's body type and recommends the most effective submission techniques. Finally, I built a user-friendly web application hosted on the secure and scalable Amazon Web Services (AWS) platform. The application interacts with a PostgreSQL database, also hosted on AWS, which stores the user input and the model's recommendations. This project combines data science techniques with cloud computing to give BJJ enthusiasts a personalized approach to submission selection. You can download all the necessary files from GitHub or follow along in the Jupyter Notebook below: