
BJJ Submission Finder
Let's import the necessary modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import wikipedia
import re
Loading the datasets
p_stats=pd.read_csv("physical_stats_data.csv",encoding='unicode_escape')
ADCC=pd.read_csv("adcc_historical_data.csv",delimiter=";")
print(ADCC.columns)
Selecting only wins by submission and merging data sets
ADCC.dropna(subset=["submission",'weight_class', 'sex'], inplace=True)
ADCC_win=ADCC[['winner_name','submission','weight_class', 'sex']]
ADCC_sub=ADCC_win.groupby("winner_name",as_index=False).agg(lambda x: pd.Series.mode(x)[0])
ADCC_sub.rename(columns={"winner_name":"Name"},inplace=True)
ADCC_final=pd.merge(ADCC_sub,p_stats, on=["Name"], how="left")
missing_vals=ADCC_final[ADCC_final["Height"].isna()]["Name"]
print(missing_vals.head())
Web scraping for more athlete stats
name="gordon ryan" # Testing with Gordon rayan
def get_p_stats(name):
    # Search Wikipedia for the athlete's page and parse the infobox for height and weight.
    word = name + " grappler"
    wiki_search = wikipedia.search(word)
    height = float("nan")
    weight = float("nan")
    if not wiki_search:
        return height, weight
    try:
        wiki_page = wikipedia.page(wiki_search[0])
    except Exception:
        return height, weight
    url = wiki_page.url
    url_open = requests.get(url)
    soup = BeautifulSoup(url_open.content, "html.parser")
    details = soup('table', {'class': 'infobox'})
    p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'  # matches integers and decimals
    for i in details:
        h = i.find_all('tr')
        for j in h:
            heading = j.find_all('th')
            detail = j.find_all('td')
            if heading is not None and detail is not None:
                for x, y in zip(heading, detail):
                    # Getting the height
                    if x.text == "Height":
                        nums = []
                        idx_m = y.text.find("m")
                        if idx_m != -1:
                            if re.search(p, y.text) is not None:
                                for catch in re.finditer(p, y.text):
                                    nums.append(catch[0])
                            if len(nums) > 1:
                                idx_imp = y.text.find("ft")
                                if idx_imp < idx_m:
                                    # Imperial value comes first, so the metric value is the last match
                                    height = float(nums[2])
                                    if height < 10:
                                        height = height * 100  # convert metres to cm
                                else:
                                    height = float(nums[0])
                                    if height < 10:
                                        height = height * 100  # convert metres to cm
                    # Getting the weight
                    if x.text == "Weight":
                        # print(y.text)
                        nums = []
                        idx_m = y.text.find("kg")
                        if idx_m != -1:
                            if re.search(p, y.text) is not None:
                                for catch in re.finditer(p, y.text):
                                    nums.append(catch[0])
                            if len(nums) > 1:
                                idx_imp = y.text.find("lb")
                                if idx_imp < idx_m:
                                    weight = float(nums[1])
                                else:
                                    weight = float(nums[0])
    return height, weight
res=get_p_stats(name)
print(res)
Looking for the missing stats
height = []
weight = []
missing_athletes = []
name_list = missing_vals.tolist()
# name_list=["Abraham Marte Messina"]
for name in name_list:
    # print(name)
    try:
        h, w = get_p_stats(name)
        height.append(h)
        weight.append(w)
    except Exception:
        height.append(float("nan"))
        weight.append(float("nan"))
        missing_athletes.append(name)
print(missing_athletes)
Creating a result dataframe and saving it
df=pd.DataFrame({"name":name_list, "height":height,"weight":weight})
df.to_csv("Data/wiki_results") #saving the results
print(df.isna().sum())
Let's check the stats
df_missing=df[df["height"].isna()]
print(df.describe())
Wikipedia didn't get us far; let's try Google search instead
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
from dotenv import load_dotenv
_ = load_dotenv()
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import random
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),options = chrome_options)
# Getting rid of the cookies pop ups
# name='gordon ryan'
name='Craig Jones'
query=name.replace(" ","+")+"+bjj"+"+height"
base_url="https://www.google.com/search?q="
url=base_url+query
data = driver.get(url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//button[@id="L2AGLb"]'))).click()
def extract_digit(txt):
    # Pull the first numeric token (integer or decimal) out of a string.
    p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'
    num = float(re.findall(p, txt)[0])
    return num

def Extract_Result_From_Answer(driver, key):
    # Read Google's answer-box div, identified by its class name.
    res = driver.find_element(By.XPATH, '//div[@class="' + key + '"]').text
    res = extract_digit(res)
    return res

def extract_Result_From_Table(driver, unit):
    # Fall back to the web-answers table and grab the first cell mentioning the unit.
    res = driver.find_element(By.XPATH, '//div[@class="webanswers-webanswers_table__webanswers-table"]').find_element(By.TAG_NAME, 'table').find_element(By.XPATH, '//*[contains(text(),"' + unit + '")]').text
    res = extract_digit(res)
    return res
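As a quick sanity check of the regex helper (hypothetical input string, not a scraped value):
print(extract_digit("1.88 m (6 ft 2 in)"))  # -> 1.88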
def extract_height(driver, name):
    base_url = "https://www.google.com/search?q="
    # suffixes=['bjj','grappler','ADCC','IBJJF','Brazilian Jiu-Jitsu','Brazilian Jiu Jitsu','Jiu-Jitsu']
    suffixes = ['BJJ']
    keys = ['Z0LcW t2b5Cf CfV8xf', 'Z0LcW t2b5Cf']
    for key in keys:
        for i in suffixes:
            query = name.replace(" ", "+") + '+' + i + "+height"
            url = base_url + query
            driver.get(url)
            try:
                h = Extract_Result_From_Answer(driver, key)
                if h < 10:
                    h = h * 100  # convert metres to cm
                return h
            except Exception:
                pass
            try:
                time.sleep(random.randint(1, 5))
                h = extract_Result_From_Table(driver, ' cm')
                if h < 100:
                    h = h * 100
                return h
            except Exception:
                pass
            try:
                time.sleep(random.randint(1, 5))
                h = extract_Result_From_Table(driver, ' m')
                if h < 10:
                    h = h * 100  # convert metres to cm
                return h
            except Exception:
                pass
    return float("nan")
def extract_weight(driver, name):
    base_url = "https://www.google.com/search?q="
    # suffixes=['bjj','grappler','ADCC','IBJJF']
    key = 'Z0LcW t2b5Cf'
    suffixes = ['BJJ']
    for i in suffixes:
        query = name.replace(" ", "+") + '+' + i + "+weight"
        url = base_url + query
        driver.get(url)
        try:
            w = Extract_Result_From_Answer(driver, key)
            return w
        except Exception:
            pass
        try:
            time.sleep(random.randint(1, 5))
            w = extract_Result_From_Table(driver, ' kg')
            return w
        except Exception:
            pass
    return float("nan")
extract_weight(driver,name)
from tqdm import tqdm
import random
height=[]
weight=[]
missing_athletes_google=[]
missing_athletes=df[df["height"].isna()]['name'].tolist()
for name in tqdm(missing_athletes):
    h = extract_height(driver, name)
    time.sleep(random.randint(1, 10))
    w = extract_weight(driver, name)
    height.append(h)
    weight.append(w)
    time.sleep(random.randint(1, 10))
    # print(f"{name} Height={h} Weight={w}")
    if np.isnan(h) or np.isnan(w):
        missing_athletes_google.append(name)
df_google=pd.DataFrame({'name':missing_athletes, 'height':height,'weight':weight })
df_google.to_csv("Data/google_results")
missing_athletes_google=df_google[df_google["height"].isna()]['name'].tolist()
It looks like the heights and weights are going to be hard to find. Let's move on and fill the missing values using the weight class: the mean height and weight of competitors in the same class.
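As a quick illustration of that fill pattern (toy numbers, not from the dataset), a groupby-transform of the mean replaces a missing value with its group's average:
toy = pd.DataFrame({'weight_class': ['66kg', '66kg', '66kg', '88kg'],
                    'height': [165.0, np.nan, 171.0, np.nan]})
toy['height'] = toy['height'].fillna(toy.groupby('weight_class')['height'].transform('mean'))
print(toy)  # the 66kg NaN becomes 168.0; the 88kg NaN stays NaN (no data in that group)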
# Let's merge the results
df_google=pd.read_csv("Data/google_results")
df_wiki=pd.read_csv("Data/wiki_results")
Data_set_final=pd.concat([df_google, df_wiki])
Data_set_final.drop(columns=['Unnamed: 0'],inplace=True)
Data_set_final.rename(columns={"name":"Name"},inplace=True)
# Adding weight category to fill in the missing weights
Data_set_final=Data_set_final.merge(ADCC_final[['Name','submission','weight_class','sex']], on=["Name"], how="left")
Let's clean up the data
fillers=Data_set_final[['height', 'weight','sex','weight_class']].groupby(['sex','weight_class']).mean()
print(fillers)
# It looks like we don't have the average weight for female grapplers in the -60 kg class; let's just replace it with 60
fillers.fillna(60,inplace=True)
# Let's fill in the missing values with the group means
Data_set_final['height'].fillna(Data_set_final[['height','sex','weight_class']].groupby(['sex','weight_class'])['height'].transform('mean'),inplace=True)
Data_set_final['weight'].fillna(Data_set_final[['weight','sex','weight_class']].groupby(['sex','weight_class'])['weight'].transform('mean'),inplace=True)
print(Data_set_final[Data_set_final['weight'].isna()][['sex','weight_class']])
# All remaining NaNs are in the female -60 kg class; let's fill them with 60 kg
Data_set_final['weight'].fillna(60,inplace=True)
Data_set_final.to_csv('Data/Data_set_final')
Data_set_final=pd.read_csv('Data/Data_set_final',index_col=[0])
Data_set_final['gender']=(Data_set_final['sex']=='M')
Data_set_clean=Data_set_final
count=dict(Data_set_clean['submission'].value_counts())
Data_set_clean['sub_count']=Data_set_clean['submission'].map(count)
print(Data_set_clean.head())
Data_set_clean.drop(Data_set_clean[Data_set_clean['sub_count']<10].index,inplace=True)
Data_set_clean.drop(Data_set_clean[Data_set_clean['submission']=='Submission'].index,inplace=True)
Data_set_clean.drop(columns=['sub_count'],inplace=True)
Data_set_clean.to_csv('Data/data_set_training')
# Let's get training
from sklearn.model_selection import ShuffleSplit, train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
Data_set_clean=pd.read_csv('Data/data_set_training',index_col=[0])
features=['height','weight','gender']
X=Data_set_clean[features].to_numpy()
y=Data_set_clean['submission']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sns.pairplot(Data_set_clean,hue='submission',vars=features,kind='scatter')
Training a KNN classifier
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
k_values = list(range(1, 32))
scores = []
ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=1)
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
score = cross_val_score(knn, X_train_s, y_train, cv=ss)
scores.append(np.mean(score))
sns.lineplot(x = k_values, y = scores, marker = 'o')
plt.xlabel("K Values")
plt.ylabel("Accuracy Score")
best_index = np.argmax(scores)
best_k = k_values[best_index]
knn = KNeighborsClassifier(n_neighbors=best_k)
# Fit on the training split only so the held-out test set stays unseen
knn.fit(X_train_s, y_train)
y_pred = knn.predict(X_test_s)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted',zero_division=0.0)
recall = recall_score(y_test, y_pred,average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
Training a decision tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
clf = DecisionTreeClassifier(criterion='entropy')
score = cross_val_score(clf, X_train, y_train, cv=ss)
print("Cross-validation accuracy:", np.mean(score))
# Fit on the training split only so the held-out test set stays unseen
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted',zero_division=0.0)
recall = recall_score(y_test, y_pred,average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
Plotting the tree
# class_names must follow the order of clf.classes_
plot_tree(clf,feature_names=features,class_names=clf.classes_.tolist(),filled=True)
# plt.savefig('imgs/out.pdf')
# saving the models
import pickle
knnPickle = open('models/knnpickle_file', 'wb')
pickle.dump(knn, knnPickle)
knnPickle.close()
treePickle=open('models/treepickle_file', 'wb')
pickle.dump(clf, treePickle)
treePickle.close()
scalerPickle=open('models/scalerpickle_file','wb')
pickle.dump(scaler, scalerPickle)
scalerPickle.close()
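As a quick sanity check (hypothetical athlete measurements), the saved scaler and KNN model can be reloaded and queried:
with open('models/scalerpickle_file', 'rb') as f:
    loaded_scaler = pickle.load(f)
with open('models/knnpickle_file', 'rb') as f:
    loaded_knn = pickle.load(f)
# Hypothetical athlete: 185 cm, 90 kg, male (gender encoded as 1)
sample = loaded_scaler.transform([[185, 90, 1]])
print(loaded_knn.predict(sample))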
Building the interface. To run the app, use the command: streamlit run app
import numpy as np
import pandas as pd
import streamlit as st
import pickle
from streamlit.connections import SQLConnection
from sqlalchemy.sql import text
from datetime import timedelta
def main():
    # Load the trained decision tree
    clf = pickle.load(open('models/treepickle_file', 'rb'))
    # Initialize the database connection.
    conn = st.connection("postgresql", type="sql")
    st.title("Submission Finder")
    html_temp = """
    <div>
    <h2 style="color:white;text-align:center;">Submission Finder</h2>
    </div>
    """
    st.markdown(html_temp, unsafe_allow_html=True)
    gender = st.selectbox("Gender", ["Female", "Male"])
    weight = st.text_input("Weight", "95")
    height = st.text_input("Height", "179")
    fav_sub = st.text_input("High percentage submission")
    if st.button("Predict"):
        data = {'height': int(height), 'weight': int(weight), 'gender': gender}
        df = pd.DataFrame([list(data.values())], columns=['height', 'weight', 'gender'])
        df['gender_num'] = df['gender'] == 'Male'
        features = ['height', 'weight', 'gender_num']
        X = df[features].to_numpy()
        # X=scaler.transform(X)
        y = clf.predict(X)
        # Store the user input and the prediction in the database
        with conn.session as s:
            s.execute(text('INSERT INTO physical_stats (height, weight, gender, submission, predsub) VALUES (:height, :weight, :gender, :submission, :predsub);'),
                      params=dict(height=height, weight=weight, gender=gender, submission=fav_sub, predsub=y[0]))
            s.commit()
        st.success('Submission is {}'.format(y[0]))

if __name__ == '__main__':
    main()
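The INSERT above assumes a physical_stats table already exists in the Postgres database. Here is a minimal sketch of creating it with SQLAlchemy; the connection string is a placeholder and the column types are assumptions, not taken from the project:
from sqlalchemy import create_engine, text

# Hypothetical connection string; point it at your own Postgres instance (e.g. AWS RDS).
engine = create_engine("postgresql+psycopg2://user:password@host:5432/dbname")

# Assumed column types chosen to match the INSERT statement in the app above.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS physical_stats (
            id SERIAL PRIMARY KEY,
            height NUMERIC,
            weight NUMERIC,
            gender TEXT,
            submission TEXT,
            predsub TEXT
        );
    """))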
I leveraged my data science expertise to create a web application that predicts optimal Brazilian Jiu-Jitsu (BJJ) submissions based on body type. To achieve this, I first collected data on successful submissions from the prestigious ADCC tournaments and gathered athlete statistics through web scraping. I then trained a decision tree model on this data; it analyzes an individual's body type and recommends the most effective submission techniques. Finally, I built a user-friendly web application hosted on the secure and scalable Amazon Web Services (AWS) platform. The application interacts with a PostgreSQL database, also hosted on AWS, which stores the user input and the model's recommendations. This project combines data science techniques with cloud computing to give BJJ enthusiasts a personalized approach to submission selection. You can download all the necessary files from GitHub or follow along in the Jupyter Notebook below: