Home Artificial Intelligence Predicting the Unpredictable — March Madness using Snowpark and Hex Solution Overview Putting It All Together Model Training Inference For 2023 Conclusion

Predicting the Unpredictable — March Madness using Snowpark and Hex Solution Overview Putting It All Together Model Training Inference For 2023 Conclusion

2
Predicting the Unpredictable — March Madness using Snowpark and Hex
Solution Overview
Putting It All Together
Model Training
Inference For 2023
Conclusion

Photo by Jacob Rice on Unsplash

Data Ingestion

with zipfile.ZipFile("march-machine-learning-mania-2023.zip") as zf:
for file in zf.filelist:
if file.filename.endswith(".csv"):
with zf.open(file.filename) as z:
df = pd.read_csv(z, encoding="iso-8859-1")
table_name = file.filename.split("/")[-1].replace(".csv", "").upper()
df.columns = [col.upper() for col in df.columns]
session.create_dataframe(df).write.save_as_table(
table_name=table_name, mode="overwrite"
)py

Feature Engineering

Aggregations

Win Locations

Region and Seed Values

Coach Tenure

def extract_coaches_tenure(
session: snowflake.snowpark.Session,
source_table: str,
target_table: str,
exclude_seasons: list,
) -> str:
df = session.table(source_table)

if exclude_seasons:
df = df.filter(~F.col("SEASON").isin(exclude_seasons))

# There are some teams which have multiple coaches in a season.
# We'd like to leverage a Window Functions to account for this and
# One other function count up tenure.
coaches_tenure = (
df.with_column(
"ROW_NUM",
F.row_number().over(
Window.partition_by(["SEASON", "TEAMID"]).order_by(
F.col("LASTDAYNUM").desc()
)
),
)
.filter(F.col("ROW_NUM") == 1)
.select("SEASON", "TEAMID", "COACHNAME")
.with_column(
"COACH_TENURE",
F.row_number().over(
Window.partition_by(["TEAMID", "COACHNAME"]).order_by(F.col("SEASON"))
),
)
)

coaches_tenure.write.save_as_table(target_table, mode="overwrite")

return f"Successfully created {target_table}."

Best Tournament Finish

Win/Loss Statistics

import xgboost as xgb
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

train_df = df_pd[df_pd["SEASON"] < 2020]
test_df = df_pd[df_pd["SEASON"] > 2020]

X_train = train_df.drop("WIN_INDICATOR", axis=1)
y_train = train_df["WIN_INDICATOR"]

X_test = test_df.drop("WIN_INDICATOR", axis=1)
y_test = test_df["WIN_INDICATOR"]

model = xgb.XGBClassifier(n_estimators=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

def tourney_predict(session: Session) -> str:
df = session.table("FEATURES.JOINED_FEATURES_TO_TOURNEY_TEAMS")

new_cols = {}

for c in df.columns:
if c.startswith("W"):
new_cols[c] = "L" + c[1:]
elif c.startswith("L"):
new_cols[c] = "W" + c[1:]
else:
new_cols[c] = c

df_flipped = df.select([F.col(c).alias(new_cols.get(c, c)) for c in df.columns]).select(
*[col for col in df.columns]
)

df = df.with_column("WIN_INDICATOR", F.lit(1))
df_flipped = df_flipped.with_column("WIN_INDICATOR", F.lit(0))

df = df.union_all(df_flipped)

df_pd = df.to_pandas()

new_dtypes = {}
for c in df_pd.columns:
if str(df_pd[c].dtype).startswith("u"):
new_dtypes[c] = str(df_pd[c].dtype).replace("u", "")
else:
new_dtypes[c] = str(df_pd[c].dtype)

df_pd = df_pd.astype(new_dtypes)

train_df = df_pd[df_pd["SEASON"] < 2020]
test_df = df_pd[df_pd["SEASON"] > 2020]

X_train = train_df.drop("WIN_INDICATOR", axis=1)
y_train = train_df["WIN_INDICATOR"]

X_test = test_df.drop("WIN_INDICATOR", axis=1)
y_test = test_df["WIN_INDICATOR"]

model = xgb.XGBClassifier(n_estimators=1000)
model.fit(X_train, y_train)

return "success"

session.sproc.register(func=tourney_predict,
name="COMMON.TOURNEY_PREDICT",
packages=['snowflake-snowpark-python','pandas','xgboost'],
is_permanent=True,
stage_location="@COMMON.PYTHON_CODE",
replace=True)

Clean & Prep 2023 data

from copy import copy

m_teams_2023 = (
session.table("RAW.MTEAMCONFERENCES")
.filter(F.col("SEASON") == 2023)
.drop("CONFABBREV")
)
w_teams_2023 = (
session.table("RAW.WTEAMCONFERENCES")
.filter(F.col("SEASON") == 2023)
.drop("CONFABBREV")
)
m_teams_2023 = m_teams_2023.join(
copy(m_teams_2023), how="cross", lsuffix="_L", rsuffix="_R"
).filter(F.col("TEAMID_L") < F.col("TEAMID_R"))
w_teams_2023 = w_teams_2023.join(
copy(w_teams_2023), how="cross", lsuffix="_L", rsuffix="_R"
).filter(F.col("TEAMID_L") < F.col("TEAMID_R"))
teams_2023 = m_teams_2023.union_all(w_teams_2023)
teams_2023 = teams_2023.with_column(
"ID",
F.concat(
F.col("SEASON_L"), F.lit("_"), F.col("TEAMID_L"), F.lit("_"), F.col("TEAMID_R")
),
)

Perform Inference for Winning Probabilities

train_df = df_pd[df_pd["SEASON"] <= 2022]

X_train = train_df.drop(["SEASON", "WIN_INDICATOR"], axis=1)
y_train = train_df["WIN_INDICATOR"]

model_2022 = xgb.XGBClassifier(n_estimators=1000)
model_2022.fit(X_train, y_train)

X_pred = JOINED_FEATURES_TO_TOURNEY_TEAMS_pd.drop(["ID"], axis=1)
y_pred = model.predict_proba(X_pred)[:, 1]

submission_df = pd.DataFrame(JOINED_FEATURES_TO_TOURNEY_TEAMS_pd["ID"])
submission_df["Pred"] = y_pred

session.create_dataframe(submission_df).write.save_as_table(
"FEATURES.M_SUBMISSION_2023", mode="overwrite"
)

split_pred = (
session.table("FEATURES.M_SUBMISSION_2023")
.with_column("TEAM1", F.solid(F.split(F.col("ID"), F.lit("_"))[1], T.IntegerType()))
.with_column("TEAM2", F.solid(F.split(F.col("ID"), F.lit("_"))[2], T.IntegerType()))
.select("ID", "TEAM1", "TEAM2", '"Pred"')
)

teams = session.table("RAW.MTEAMS")

t1_join = (
split_pred.join(teams, on=split_pred.TEAM1 == teams.TEAMID)
.with_column_renamed("TEAMNAME", "T1NAME")
.drop("TEAMID", "FIRSTD1SEASON", "LASTD1SEASON")
)
t2_join = (
t1_join.join(teams, on=split_pred.TEAM2 == teams.TEAMID)
.with_column_renamed("TEAMNAME", "T2NAME")
.drop("TEAMID", "FIRSTD1SEASON", "LASTD1SEASON")
)

t2_join = t2_join.with_column_renamed('"Pred"', "PREDICTION")

t2_join.write.save_as_table("FEATURES.M_SUBMISSION_TEAMNAMES_2023", mode="overwrite")

2 COMMENTS

LEAVE A REPLY

Please enter your comment!
Please enter your name here