Statistical project – correlations across events in speedcubing¶

In this project, my goal is to investigate the correlations between different events in speedcubing. The data is sourced from the WCA (World Cube Association) database, which includes results from official competitions.

Section 1: Introduction to speedcubing¶

Have you ever solved a Rubik's Cube? If so, you probably found it quite a challenge. For some, simply solving it is not enough—they strive to solve it faster, learn new methods, algorithms, and find more efficient solutions.

The people who enjoy solving the Rubik's cube and other twisty puzzles are called speedcubers. They compete in official competitions organized by the World Cube Association (WCA), which oversees all official speedcubing competitions and maintains the official records.

There are many types of twisty puzzles, not just the classic 3x3x3 cube, but also 2x2x2, 4x4x4, 5x5x5, and more.

Section 2: Competition rules¶

When registering for a competition, competitors can choose which events they want to enter. The events include:

  • 3x3 Cube
  • 2x2 Cube
  • 4x4 Cube
  • 5x5 Cube
  • 6x6 Cube
  • 7x7 Cube
  • Pyraminx
  • Skewb
  • Megaminx
  • Clock
  • Square-1
  • 3x3 One-handed
  • 3x3 Blindfolded
  • 3x3 Fewest Moves
  • 3x3 Multi-Blind
  • 4x4 Blindfolded
  • 5x5 Blindfolded

More information about these events follows in Section 3.

Not every competition has to include all events. The organizers can omit events due to time, capacity or other limitations.

For most events, each round consists of 5 attempts. Each round gives the competitor two results: a single and an average. The single is the fastest time of the five attempts. The average is the mean of the three middle attempts, i.e. the fastest and slowest times are excluded.
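
To make the scoring concrete, here is a minimal sketch of the averaging rule in Python (an illustration only, not part of the analysis code; it ignores DNFs, penalties, and the exact rounding rules from the WCA regulations, and the times are made up):

# Minimal sketch of the "average of 5" rule: discard the fastest and the
# slowest attempt, then take the mean of the remaining three.
def average_of_five(times):
    middle_three = sorted(times)[1:-1]  # drop the best and the worst of the 5 attempts
    return sum(middle_three) / len(middle_three)

attempts = [9.81, 11.24, 10.05, 12.90, 10.52]        # hypothetical times in seconds
print(f"single:  {min(attempts):.2f}")               # 9.81  (best attempt)
print(f"average: {average_of_five(attempts):.2f}")   # 10.60 (mean of 10.05, 10.52, 11.24)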

Before starting an attempt, the competitor has 15 seconds to inspect the puzzle. During this time, they can plan their first moves but cannot turn the puzzle. After inspection, the competitor starts the timer by placing both hands on sensors. When they lift their hands, the timer starts and the solve begins. After finishing, they stop the timer by placing both hands back on the sensors.

Here's a video showing a competition solve: World Record [former] - 4.73 seconds - Feliks Zemdegs.

Glossary¶

Let's go over some key terms to understand the rest of this paper:

  • 3x3, 4x4, ..., NxN: Notation for cubes with N layers, shorthand for "NxN Rubik's Cube." For example, a 3x3 has 3 layers, a 4x4 has 4, and so on.
  • Edge: A piece with two colors, located between the corners on the cube. On a 3x3, there are 12 edge pieces.
  • Corner: A piece with three colors, located at the corners of the cube. On a 3x3, there are 8 corner pieces.
  • Center: A piece with one color, located in the center of each face. On a 3x3, centers are fixed; on larger cubes, centers must be solved. 2x2 does not have any centers.
  • Algorithm: A sequence of moves designed to achieve a specific result, such as swapping or rotating pieces.
  • Method: A structured approach or set of steps for solving a puzzle.
  • Orientation: The process of turning pieces so their colored stickers face the correct direction, usually referring to the last layer.
  • Permutation: The process of moving pieces to their correct locations, often after orientation is complete.
  • Scramble: A random sequence of moves applied to a solved puzzle to mix it up before solving.
  • Solve: The process of returning a scrambled puzzle to its solved state, where each face is a single color.
  • Inspection: The 15-second period before a solve during which competitors can examine the puzzle and plan their solution, but cannot turn the puzzle.
  • Last Layer: The final layer to be solved, typically the top face of the cube.
  • One-looking: A technique where a solver plans the entire solution during the inspection time, allowing them to execute the solution without pausing during the solve.
  • BLD: Shorthand for blindfolded.

Section 3: Puzzle types¶

The original Rubik's Cube is the 3x3, but there are many other types of puzzles. Some are not even cubes. Let's go through the main puzzle types and the most common solving methods.

3x3¶

This is the original puzzle invented by Ernő Rubik. It has 6 colors, most commonly: white, yellow, blue, green, red, and orange. The goal is to arrange the cube so that each face is a single color.

The most common method used by speedcubers is CFOP—an abbreviation for Cross, F2L, OLL, PLL. The method consists of four steps:

  1. Cross – solving the cross on the first layer
  2. F2L – solving the first two layers
  3. OLL – orienting the last layer (making all pieces on the top face the same color)
  4. PLL – permuting the last layer (moving the pieces on the top layer to their correct positions)

The first two steps are mostly intuitive, while the last two rely on algorithms.

2x2¶

Smaller than the classic cube, this puzzle is much easier to solve, with the world record being under 1 second. There are several methods for solving the 2x2:

  • Layer by layer – solve the first layer intuitively, then solve the second layer with algorithms similar to 3x3 OLL and PLL
  • Ortega – solve a face (one side of the same color), orient the second layer, then finish the cube with a single algorithm
  • EG – solve a face, then finish the cube with a single algorithm (this method uses 126 algorithms)

Top solvers use one-looking (see glossary), which allows them to solve this puzzle very quickly.

4x4¶

The 4x4 cube, also known as the Rubik's Revenge, introduces additional complexity compared to the 3x3 due to the lack of fixed center pieces and the presence of more edge and center pieces. This means centers must be solved first, and edge pieces must be paired before the cube can be solved like a 3x3.

The most common method for solving the 4x4 is Yau, which is a variant of the reduction method. The Yau method typically involves:

  1. Solving two opposite centers
  2. Pairing three cross edges
  3. Solving the remaining centers
  4. Pairing the remaining edges
  5. Solving the cube as a 3x3

The standard reduction method involves solving all centers first, then pairing all edges, and finally solving the cube as a 3x3. The Yau method is generally faster and more efficient.

A unique challenge with the 4x4 is parity errors—situations that cannot occur on a 3x3. The two main parities are:

  • PLL parity: When only two corner or two edge pieces are swapped.
  • OLL parity: A situation where only one edge is incorrectly oriented.

These parities occur because the 4x4 has an even number of layers, which allows for move combinations not possible on odd-layered cubes. Handling the parity requires a special algorithm.

The reason why the parity happens is mathematically very interesting, but is out of scope for this paper.

5x5¶

More pieces mean more work and more time. The common method is reduction, consisting of these steps:

  • Solving centers (center pieces have a single color; the goal is to arrange them correctly)
  • Pairing edges (edges have two colors; pairing means putting matching edge pieces together)
  • 3x3 stage (once centers and edges are solved, the puzzle is essentially reduced to a 3x3, hence the name, and can be solved using only outer-layer turns)

6x6¶

The common method is reduction, involving solving centers, pairing edges, and the 3x3 stage. Sounds familiar? That's right, this method is exactly the same as the method for 5x5.

There are some differences, mainly parity—the same thing that happens on 4x4. But the key parts of the method are the same.

7x7¶

As you might expect, the method used is reduction, just like the 5x5 and 6x6. Once you know how to solve the 5x5, you can solve cubes of any size—even a 15x15—the method remains the same.

The use of the same method will be important later, when we discuss the correlation across these events.

For both 6x6 and 7x7, the standard format is not average of 5 solves, but mean of 3 solves. This is to reduce the time required for the event, as larger cubes take significantly longer to solve.

Megaminx¶

The Megaminx is a dodecahedron-shaped puzzle with 12 faces, each a different color. The solving process is similar to the 3x3: the puzzle is essentially solved layer by layer, finishing with the top layer. The last layer requires more algorithms due to the increased number of pieces. More pieces mean more work, but the additional faces also give more freedom to move pieces around, allowing more efficient solutions that are not possible on a regular 3x3.

Skewb¶

The Skewb is a cube-shaped puzzle, but it turns around its corners rather than its faces. It has 8 corners and 6 center pieces. The solution is relatively simple, often requiring only a few algorithms. Most methods involve solving the top and bottom layer, then the remaining centers. One-looking is commonly used by top solvers.

Pyraminx¶

The Pyraminx is a tetrahedron-shaped puzzle with 4 faces. It has 4 tips, 4 centers, and 6 edges. One-looking is commonly used by top solvers thanks to the small number of pieces.

Square-1¶

Square-1 is a cube-shaped puzzle that can change shape as it is scrambled, making it a "shape-shifter". It's the only shape-shifting event in WCA competitions. It has 8 corners and 8 edges, but the pieces are not all the same size. The solution involves restoring the cube shape first, then solving the pieces. The puzzle requires unique algorithms due to its unusual mechanics and parity issues. The methods are largely algorithm-based, with very little room for intuitive solutions.

Rubik's clock¶

The only non-twisty puzzle in the WCA, the Rubik's Clock consists of 9 clock faces on each side, each with a single hand. The goal is to set all clocks to 12 o'clock. The puzzle is solved by switching pins and turning gears that affect multiple clocks simultaneously.

3x3 One Handed¶

The 3x3 One-Handed event is similar to the regular 3x3 event, but competitors must solve the cube using only one hand. The methods used are very similar to standard 3x3 solving.

Interesting fact: most right-handed cubers use their left hand for one-handed solving.

Blindfolded events¶

Blindfolded events include 3x3 Blindfolded, 4x4 Blindfolded, 5x5 Blindfolded, and Multi-Blind. In these events, competitors first memorize the puzzle, then solve it blindfolded. There is no separate inspection period; the memorization is included in the total solve time.

Methods used for blindfolded solving require a lot more moves than traditional methods. They essentially solve the cube piece by piece, without disturbing the pieces that are not currently being solved.

Multi-Blind is about solving multiple differently scrambled 3x3 cubes blindfolded: first memorizing all of them, then solving them. The competitor can choose how many cubes they want to attempt.

FMC (Fewest Moves Challenge)¶

In FMC, competitors are given a scramble and have 1 hour to find the solution using the fewest possible moves. It's the only event where time is not the goal.


Section 4: Goal of this statistical project, methodology, hypotheses¶

The goal is to measure how strongly a competitor's skill in one speedcubing event correlates with their skill in other events.

To do this, we first need a way to quantify how "good" someone is at an event.

Measuring how "good" a speedcuber is at some event¶

There are two main approaches I came up with to quantify this. Let's go over them and list their properties, advantages and disadvantages.

  1. Performance Ratio (Relative to World Record)
    • For each competitor, their best time divided by the world record time shows how many times they are slower than the record. This is a simple and intuitive metric which turns out to be quite effective and consistent across events.
    • This method is less affected by the popularity of the event, as it directly compares the competitor's performance to the best in the world.
    • Since FMC (3x3 Fewest moves) is an untimed event, this method doesn't make much sense to use for it.
    • This method will be called the "performance ratio" method from now on for simplicity.
  2. Relative Ranking (Percentile)
    • There are world, continental, and country rankings for each event. For simplicity, the world ranking will be used, since it provides the most data.
    • Comparing absolute rankings makes little sense, since event popularity varies widely: unsurprisingly, 3x3 is the event almost every competitor has participated in, while events like 5x5 Blindfolded are far less popular, both because of their difficulty and because few competitions include them.
    • For the reasons stated above, percentile will be used.
    • This method will be called the "percentile" method from now on for simplicity.

For both approaches, the average of 5 solves will be compared, since it offers a more balanced view of a competitor's ability. A single solve can be heavily influenced by luck, while an average of multiple solves also reflects consistency.

If average of 5 can't be used (either due to lack of data, or due to event's different format), the best single solve will be used instead.

As I am unsure of which approach is better, I will calculate correlations using both.
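
Before running this on the full dataset, here is a tiny sketch of both metrics on made-up numbers (the world record, rank, and competitor count below are hypothetical and only illustrate the formulas):

# Performance ratio: the competitor's result divided by the world record result.
world_record_avg = 4.50        # hypothetical world-record 3x3 average (seconds)
my_avg = 11.68                 # a competitor's best official average (seconds)
performance_ratio = my_avg / world_record_avg
print(f"performance ratio: {performance_ratio:.2f}")   # 2.60, i.e. 2.6 times slower than the record

# Percentile: world rank rescaled so that rank 1 maps to 1.0 and the
# last ranked competitor maps to just above 0.
world_rank = 15_000            # hypothetical world rank
ranked_competitors = 250_000   # hypothetical number of ranked competitors
percentile = 1 - (world_rank - 1) / ranked_competitors
print(f"percentile: {percentile:.3f}")                  # 0.940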

My hypotheses¶

Based on my experience as a speedcuber, I have developed several hypotheses about the correlation between different events. A bit of context: I competed for the first time in Bratislava in 2016. My best 3x3 average is 11.68 seconds, ranking me 94th (out of 703) in the Czech Republic and about 15,000th (out of almost 250,000 competitors) worldwide. You can also have a look at my WCA profile.

What correlations I expect¶

The correlation coefficient is denoted by $r$ in the following paragraphs.
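
For reference, all correlations in this project are Pearson correlation coefficients (this is also what the code in Section 5 computes). For two paired samples $x$ and $y$ of size $n$,

$$ r = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}, $$

where $\bar{x}$ and $\bar{y}$ are the sample means. The coefficient always lies between $-1$ (perfect negative linear relationship) and $1$ (perfect positive linear relationship), with values near $0$ meaning no linear relationship.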

  1. 5x5, 6x6, and 7x7 are solved using the same method, and improving times on them means improving very similar skills. Therefore, I expect those to be highly correlated.
    • These events are more "stamina-based", since a single solve takes upwards of several minutes. This also adds to their similarity.
    • Hypothesis: strong positive correlation, $r\geq0.7$
  2. 3x3 and 4x4 are very popular events that many cubers practice together. While their methods differ a bit (4x4 adds several steps on top of the 3x3 method), I expect them to be quite correlated as well.
    • Hypothesis: strong positive correlation, $r\geq0.7$
  3. I expect 3x3 Blindfolded, 4x4 Blindfolded, and 5x5 Blindfolded to have a low to moderate correlation with other non-blindfolded events, but a strong correlation with each other. Once a cuber finds blindfolded solving fun, they are probably willing to practice other blindfolded events too. There are also many cubers with good results in other events who don't compete in blindfolded solving because they find it uninteresting.
    • Hypotheses:
      • moderate positive correlation, $0.4 \leq r \leq 0.6$ with other non-blindfolded events
      • strong positive correlation, $r\geq0.7$ between each pair of blindfolded events
  4. 3x3 Fewest Moves won't correlate much with anything. It's a thing of its own, partly because it's the only untimed event.
    • Hypothesis: weak positive correlation, $r \leq 0.3$ with all other events
  5. Square-1 is a puzzle very different from all the others. I expect the correlation to be moderate, not very high, since cubers who are more willing to practice in general will also practice Square-1 and improve their times.
    • Hypothesis: moderate positive correlation, $0.4 \leq r \leq 0.6$ with all other events.
  6. 2x2, Pyraminx, and Skewb are all fast-paced, short events. Top solvers usually use one-looking in all three. Their methods don't share many similarities, but many cubers I know practice them together. I expect decent correlation there.
    • Hypothesis: strong positive correlation, $r \geq 0.7$

Section 5: Calculating the correlations¶

Let's start by calculating the performance ratio correlations first.

In [1]:
# Compute correlation matrix for all events (Pearson correlation)
import numpy as np
from scipy.stats import pearsonr
import pandas as pd

results = pd.read_csv(
    'data/WCA_export_Results.tsv',
    sep='\t',
)
# In the WCA export, results are stored as integers: centiseconds for timed
# events and moves for FMC; 0 means "no result" and negative values encode DNF/DNS.
results['average'] = pd.to_numeric(results['average'], errors='coerce').fillna(0).astype(int)
results['best'] = pd.to_numeric(results['best'], errors='coerce').fillna(0).astype(int)

# Prefer the average; fall back to the best single when no average exists
results['score'] = np.where(results['average'] > 0, results['average'], results['best'])

# Get best score for each person and event for ALL events
best_scores = results[results['score'] > 0].groupby(['personId', 'eventId'])['score'].min().reset_index()

# Get world record for each event (lowest score)
world_records = best_scores.groupby('eventId')['score'].min().to_dict()

# Compute performance ratio
best_scores['performance_ratio'] = best_scores.apply(
    lambda row: row['score'] / world_records[row['eventId']], axis=1
)

all_pivot = best_scores.pivot(index='personId', columns='eventId', values='performance_ratio')

# Filter events with enough data
min_competitors = 30
events_to_exclude = ['333mbf', '333mbo', 'magic', 'mmagic', '333ft']
valid_events = [eid for eid in all_pivot.columns if all_pivot[eid].count() >= min_competitors and eid not in events_to_exclude]

# Initialize correlation matrix for performance ratio
event_corr = pd.DataFrame(index=valid_events, columns=valid_events, dtype=float)

# Calculate correlation for each pair

for i, e1 in enumerate(valid_events):
    for j, e2 in enumerate(valid_events):
        if e1 == e2:
            event_corr.loc[e1, e2] = 1.0
        elif i > j:
            event_corr.loc[e1, e2] = event_corr.loc[e2, e1]  # Symmetric
        else:
            pair = all_pivot[[e1, e2]].dropna()
            if len(pair) >= min_competitors:
                try:
                    corr, _ = pearsonr(pair[e1], pair[e2])
                    event_corr.loc[e1, e2] = corr
                except ValueError:
                    event_corr.loc[e1, e2] = np.nan
            else:
                event_corr.loc[e1, e2] = np.nan

Now let's do the same, but using the percentile method this time.

In [2]:
# Calculate correlation matrix using world ranking percentile

ranks = pd.read_csv('data/WCA_export_RanksAverage.tsv', sep='\t', dtype=str)
ranks['worldRank'] = pd.to_numeric(ranks['worldRank'], errors='coerce')
event_counts = ranks.groupby('eventId')['personId'].count().to_dict()
ranks['percentile'] = ranks.apply(
    lambda row: 1 - (row['worldRank'] - 1) / event_counts[row['eventId']], axis=1
)

pivot_pct = ranks.pivot(index='personId', columns='eventId', values='percentile')
min_competitors = 30
events_to_exclude = ['333mbf', '333mbo', 'magic', 'mmagic', '333ft']
valid_events = [eid for eid in pivot_pct.columns if pivot_pct[eid].count() >= min_competitors and eid not in events_to_exclude]

# Ensure valid_events only contains eventIds present in pivot_pct columns
valid_events = [eid for eid in valid_events if eid in pivot_pct.columns]

# Initialize correlation matrix for percentile
event_corr_pct = pd.DataFrame(index=valid_events, columns=valid_events, dtype=float)

# Calculate correlation for each pair
for i, e1 in enumerate(valid_events):
    for j, e2 in enumerate(valid_events):
        if e1 == e2:
            event_corr_pct.loc[e1, e2] = 1.0
        elif i > j:
            event_corr_pct.loc[e1, e2] = event_corr_pct.loc[e2, e1]  # Symmetric
        else:
            if e1 not in pivot_pct.columns or e2 not in pivot_pct.columns:
                event_corr_pct.loc[e1, e2] = np.nan
                continue
            pair = pivot_pct[[e1, e2]].dropna()
            if len(pair) >= min_competitors:
                try:
                    corr, _ = pearsonr(pair[e1], pair[e2])
                    event_corr_pct.loc[e1, e2] = corr
                except ValueError:
                    event_corr_pct.loc[e1, e2] = np.nan
            else:
                event_corr_pct.loc[e1, e2] = np.nan

Here's the resulting correlation matrix for performance ratio, rounded to two decimal digits:

In [3]:
event_corr_rounded = event_corr.round(2)
display(event_corr_rounded)
222 333 333bf 333fm 333oh 444 444bf 555 555bf 666 777 clock minx pyram skewb sq1
222 1.00 0.78 0.27 -0.02 0.68 0.69 0.20 0.62 0.15 0.54 0.48 0.38 0.57 0.57 0.58 0.47
333 0.78 1.00 0.31 -0.01 0.83 0.83 0.23 0.74 0.17 0.66 0.59 0.40 0.65 0.58 0.58 0.50
333bf 0.27 0.31 1.00 0.01 0.29 0.33 0.71 0.30 0.64 0.26 0.23 0.17 0.28 0.20 0.20 0.21
333fm -0.02 -0.01 0.01 1.00 -0.01 -0.03 0.04 -0.02 0.02 0.01 -0.02 -0.02 -0.02 -0.01 -0.00 -0.02
333oh 0.68 0.83 0.29 -0.01 1.00 0.74 0.21 0.67 0.18 0.58 0.50 0.36 0.60 0.47 0.47 0.47
444 0.69 0.83 0.33 -0.03 0.74 1.00 0.24 0.85 0.20 0.77 0.69 0.40 0.70 0.53 0.51 0.52
444bf 0.20 0.23 0.71 0.04 0.21 0.24 1.00 0.22 0.83 0.23 0.21 0.15 0.18 0.17 0.11 0.16
555 0.62 0.74 0.30 -0.02 0.67 0.85 0.22 1.00 0.20 0.87 0.83 0.35 0.69 0.49 0.43 0.49
555bf 0.15 0.17 0.64 0.02 0.18 0.20 0.83 0.20 1.00 0.20 0.16 0.10 0.16 0.12 0.09 0.18
666 0.54 0.66 0.26 0.01 0.58 0.77 0.23 0.87 0.20 1.00 0.91 0.32 0.64 0.43 0.37 0.44
777 0.48 0.59 0.23 -0.02 0.50 0.69 0.21 0.83 0.16 0.91 1.00 0.27 0.61 0.41 0.33 0.42
clock 0.38 0.40 0.17 -0.02 0.36 0.40 0.15 0.35 0.10 0.32 0.27 1.00 0.44 0.45 0.45 0.39
minx 0.57 0.65 0.28 -0.02 0.60 0.70 0.18 0.69 0.16 0.64 0.61 0.44 1.00 0.57 0.53 0.52
pyram 0.57 0.58 0.20 -0.01 0.47 0.53 0.17 0.49 0.12 0.43 0.41 0.45 0.57 1.00 0.59 0.46
skewb 0.58 0.58 0.20 -0.00 0.47 0.51 0.11 0.43 0.09 0.37 0.33 0.45 0.53 0.59 1.00 0.44
sq1 0.47 0.50 0.21 -0.02 0.47 0.52 0.16 0.49 0.18 0.44 0.42 0.39 0.52 0.46 0.44 1.00

And here's the matrix for the percentile method, also rounded:

In [4]:
event_corr_pct_rounded = event_corr_pct.round(2)
display(event_corr_pct_rounded)
222 333 333bf 333fm 333oh 444 444bf 555 555bf 666 777 clock minx pyram skewb sq1
222 1.00 0.88 0.19 0.36 0.66 0.67 0.17 0.55 0.24 0.46 0.39 0.51 0.53 0.73 0.69 0.50
333 0.88 1.00 0.24 0.39 0.78 0.78 0.20 0.67 0.33 0.57 0.50 0.48 0.58 0.68 0.66 0.50
333bf 0.19 0.24 1.00 0.28 0.22 0.24 0.73 0.22 0.66 0.21 0.16 0.19 0.20 0.16 0.14 0.21
333fm 0.36 0.39 0.28 1.00 0.45 0.41 0.19 0.40 0.20 0.38 0.33 0.22 0.37 0.29 0.30 0.35
333oh 0.66 0.78 0.22 0.45 1.00 0.76 0.12 0.67 0.19 0.56 0.48 0.41 0.60 0.50 0.50 0.51
444 0.67 0.78 0.24 0.41 0.76 1.00 0.20 0.85 0.25 0.74 0.67 0.44 0.67 0.51 0.51 0.54
444bf 0.17 0.20 0.73 0.19 0.12 0.20 1.00 0.17 0.85 0.15 0.11 0.10 0.11 0.14 0.09 0.16
555 0.55 0.67 0.22 0.40 0.67 0.85 0.17 1.00 0.29 0.88 0.82 0.39 0.66 0.42 0.41 0.51
555bf 0.24 0.33 0.66 0.20 0.19 0.25 0.85 0.29 1.00 0.23 0.26 0.12 0.16 0.23 0.23 0.25
666 0.46 0.57 0.21 0.38 0.56 0.74 0.15 0.88 0.23 1.00 0.93 0.36 0.62 0.36 0.37 0.47
777 0.39 0.50 0.16 0.33 0.48 0.67 0.11 0.82 0.26 0.93 1.00 0.30 0.56 0.31 0.32 0.43
clock 0.51 0.48 0.19 0.22 0.41 0.44 0.10 0.39 0.12 0.36 0.30 1.00 0.43 0.52 0.53 0.47
minx 0.53 0.58 0.20 0.37 0.60 0.67 0.11 0.66 0.16 0.62 0.56 0.43 1.00 0.47 0.48 0.53
pyram 0.73 0.68 0.16 0.29 0.50 0.51 0.14 0.42 0.23 0.36 0.31 0.52 0.47 1.00 0.69 0.46
skewb 0.69 0.66 0.14 0.30 0.50 0.51 0.09 0.41 0.23 0.37 0.32 0.53 0.48 0.69 1.00 0.49
sq1 0.50 0.50 0.21 0.35 0.51 0.54 0.16 0.51 0.25 0.47 0.43 0.47 0.53 0.46 0.49 1.00

The event IDs used in the WCA dataset are abbreviations rather than full event names. Refer to the following table for the mapping:

Event name in dataset Full event name
222 2x2 cube
333 3x3 cube
333bf 3x3 blindfolded
333oh 3x3 one-handed
444 4x4 cube
444bf 4x4 blindfolded
555 5x5 cube
555bf 5x5 blindfolded
666 6x6 cube
777 7x7 cube
clock Rubik's clock
minx Megaminx
pyram Pyraminx
skewb Skewb
sq1 Square-1

Let's use a heatmap for better visualisation:

In [5]:
import seaborn as sb

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 10))
sb.heatmap(event_corr_pct_rounded, annot=True, cmap='coolwarm', vmin=0, vmax=1, linewidths=0.5)
plt.title('Correlation Heatmap (Percentile Method)')
plt.ylabel('Event')
plt.xlabel('Event')
plt.tight_layout()
plt.show()
[Figure: correlation heatmap for the percentile method]

Section 6: Results interpretation¶

For supporting or rejecting my hypotheses, I will only use results from the relative ranking (percentile) method, since I believe it is more consistent across events.

Hypothesis 1: 5x5, 6x6 and 7x7¶

Expectation: $r\geq0.7$

Let's have a look at correlation between "big cubes":

In [6]:
display(event_corr_pct_rounded.loc[['555', '666', '777'], ['555', '666', '777']])
555 666 777
555 1.00 0.88 0.82
666 0.88 1.00 0.93
777 0.82 0.93 1.00

The data supports the hypothesis: all three pairs of events have a correlation of $\geq 0.8$, a strong positive correlation.

Hypothesis 2: 3x3 and 4x4¶

Expectation: $r\geq0.7$

In [7]:
display(event_corr_pct_rounded.loc[['333'], ['444']])
444
333 0.78

Again, the hypothesis is supported by the data: the correlation is 0.78.

Hypothesis 3: Blindfolded events¶

Expectation:

  • $0.4 \leq r \leq 0.6$ with other non-blindfolded events
  • $r\geq0.7$ between each pair of blindfolded events

Let's look at correlations of 3x3-5x5 blindfolded with other events:

In [8]:
blind_events = ['333bf', '444bf', '555bf']
other_events = [col for col in event_corr_pct_rounded.columns if col not in blind_events]
display(event_corr_pct_rounded.loc[blind_events, other_events])
222 333 333fm 333oh 444 555 666 777 clock minx pyram skewb sq1
333bf 0.19 0.24 0.28 0.22 0.24 0.22 0.21 0.16 0.19 0.20 0.16 0.14 0.21
444bf 0.17 0.20 0.19 0.12 0.20 0.17 0.15 0.11 0.10 0.11 0.14 0.09 0.16
555bf 0.24 0.33 0.20 0.19 0.25 0.29 0.23 0.26 0.12 0.16 0.23 0.23 0.25

We can observe low positive correlations with other events, all below 0.3, so the hypothesis was in fact incorrect.

Let's take a look at correlations among blindfolded events:

In [9]:
display(event_corr_pct_rounded.loc[['333bf', '444bf', '555bf'], ['333bf', '444bf', '555bf']])
333bf 444bf 555bf
333bf 1.00 0.73 0.66
444bf 0.73 1.00 0.85
555bf 0.66 0.85 1.00

Now the correlations aren't low at all, with the highest being between 4x4 BLD and 5x5 BLD—which is not surprising, since both events require very similar skills and use practically the same method.

The hypothesis was correct for 3x3 blindfolded–4x4 blindfolded and 4x4 blindfolded–5x5 blindfolded, but incorrect for 3x3 blindfolded–5x5 blindfolded.

3x3 blindfolded has a slightly lower correlation with both 4x4 and 5x5 blindfolded, though it's still much higher than with other events. Again, this is not very surprising, and I believe the lower number can be explained by the lower popularity of 4x4 and 5x5 blindfolded. The popularity of events is discussed further in Section 7.

Hypothesis 4: 3x3 fewest moves¶

Expectation: $r \leq 0.3$ with other events.

This time, we'll review both the performance ratio and percentile method correlations, since they differ quite a bit.

Let's examine the correlation between 3x3 Fewest Moves (FMC) and other events using the performance ratio method:

In [10]:
display(event_corr_rounded.loc[['333fm']])
222 333 333bf 333fm 333oh 444 444bf 555 555bf 666 777 clock minx pyram skewb sq1
333fm -0.02 -0.01 0.01 1.0 -0.01 -0.03 0.04 -0.02 0.02 0.01 -0.02 -0.02 -0.02 -0.01 -0.0 -0.02

Rounded to two decimal places, the performance ratio method shows essentially zero correlation with all other events.

Now let's have a look at correlation using the percentile method:

In [11]:
display(event_corr_pct_rounded.loc[['333fm']])
222 333 333bf 333fm 333oh 444 444bf 555 555bf 666 777 clock minx pyram skewb sq1
333fm 0.36 0.39 0.28 1.0 0.45 0.41 0.19 0.4 0.2 0.38 0.33 0.22 0.37 0.29 0.3 0.35

These percentile-based correlations for FMC are noticeably higher (≈0.2–0.4) than the ones based on performance ratios. The most likely cause is that the performance ratio for FMC compares move counts rather than solve times, so the reliability of the ratio for this event is questionable.

The hypothesis was therefore supported when using the performance ratio, but not when using the percentile method.

Hypothesis 5: Square-1¶

Expectation: $0.4 \leq r \leq 0.6$ with all other events.

In [12]:
display(event_corr_pct_rounded.loc[['sq1']])
222 333 333bf 333fm 333oh 444 444bf 555 555bf 666 777 clock minx pyram skewb sq1
sq1 0.5 0.5 0.21 0.35 0.51 0.54 0.16 0.51 0.25 0.47 0.43 0.47 0.53 0.46 0.49 1.0

Overall, the correlation for Square-1 with other events ranges from 0.4 to 0.6, with the exception of blindfolded events and 3x3 Fewest Moves, which—as previously demonstrated—show consistently low correlations. This outcome aligns with expectations and supports the hypothesis.

Hypothesis 6: 2x2, Skewb and Pyraminx¶

Expectation: $r\geq 0.7$ between each pair of events.

In [13]:
display(event_corr_pct_rounded.loc[['222', 'skewb', 'pyram'], ['222', 'skewb', 'pyram']])
222 skewb pyram
222 1.00 0.69 0.73
skewb 0.69 1.00 0.69
pyram 0.73 0.69 1.00

The data largely supports the hypothesis: the 2x2–Pyraminx correlation is 0.73, while the 2x2–Skewb and Skewb–Pyraminx correlations are both 0.69, just shy of the 0.7 threshold. In practical terms, a correlation of 0.69 still indicates a strong positive relationship between the events. Given the variability in competitor skill and event participation, this result can reasonably be interpreted as supporting the hypothesis.

Section 7: Other interesting data¶

Let's explore additional insights from the WCA dataset.

To start, we'll examine the popularity of each event by counting how many competitors have participated in them. This will give us a clear ranking of event popularity within the speedcubing community:

In [14]:
# List all events from valid_events and display the total number of competitors, sorted by popularity
event_popularity = {eid: all_pivot[eid].count() for eid in valid_events}
event_popularity_sorted = sorted(event_popularity.items(), key=lambda x: x[1], reverse=True)

total_competitors = all_pivot.index.nunique()

print(f"Total number of distinct competitors: {total_competitors:,}")
event_popularity_df = pd.DataFrame(event_popularity_sorted, columns=['eventId', 'num_competitors'])
event_popularity_df['percent_of_competitors'] = (
    pd.to_numeric(event_popularity_df['num_competitors']) / total_competitors * 100
).round(2).astype(str) + ' %'
event_popularity_df_v = event_popularity_df.copy()
event_popularity_df_v['num_competitors'] = event_popularity_df_v['num_competitors'].map('{:,}'.format)
display(event_popularity_df_v.style.hide(axis='index'))
Total number of distinct competitors: 268,229
eventId num_competitors percent_of_competitors
333 258,078 96.22 %
222 170,130 63.43 %
pyram 117,934 43.97 %
444 76,460 28.51 %
333oh 67,171 25.04 %
skewb 66,653 24.85 %
555 37,009 13.8 %
minx 31,047 11.57 %
clock 26,979 10.06 %
sq1 22,885 8.53 %
666 15,465 5.77 %
777 12,447 4.64 %
333fm 11,298 4.21 %
333bf 11,033 4.11 %
444bf 1,961 0.73 %
555bf 1,014 0.38 %

Let's visualise this in a graph:

In [15]:
# Prepare event popularity DataFrame with "All" row on top
event_pop_df_simple = event_popularity_df.copy()
event_pop_df_simple['percent'] = (
    pd.to_numeric(event_pop_df_simple['num_competitors']) / total_competitors * 100
).round(2)
all_row = pd.DataFrame([{'eventId': 'All', 'num_competitors': total_competitors, 'percent': 100.0}])
event_pop_df_simple = pd.concat([all_row, event_pop_df_simple], ignore_index=True)

plt.figure(figsize=(9, 6))
sb.barplot(
    data=event_pop_df_simple,
    x='num_competitors',
    y='eventId',
    order=event_pop_df_simple['eventId']
)
plt.title('Event popularity — number of distinct competitors')
plt.xlabel('Number of competitors')
plt.ylabel('Event')

for idx, row in event_pop_df_simple.iterrows():
    plt.text(row['num_competitors'] * 1.01, idx,
             f"{int(row['num_competitors']):,} ({row['percent']:.2f}%)",
             va='center', fontsize=9)

plt.xlim(0, event_pop_df_simple['num_competitors'].max() * 1.2)
plt.tight_layout()
plt.show()
[Figure: bar chart of event popularity by number of distinct competitors]

It's no surprise that nearly every speedcuber participates in the 3x3 event. However, there are a few exceptions: almost 4% of competitors have never competed in 3x3. What events do these speedcubers choose instead? Below is a list of events along with the number of these competitors who have competed in each:

In [16]:
# Find competitors who have NOT competed in 3x3
no_333 = all_pivot[all_pivot['333'].isna()]

# For these competitors, count how many have a non-null result in each event (excluding 333)
event_counts_no_333 = no_333.drop(columns=['333']).notna().sum().sort_values(ascending=False)

# Display as a DataFrame for readability
event_counts_no_333_df = event_counts_no_333.reset_index()
event_counts_no_333_df.columns = ['eventId', 'num_competitors']
display(event_counts_no_333_df.style.hide(axis='index'))
eventId num_competitors
222 5068
pyram 5064
skewb 1155
clock 573
444 530
magic 310
minx 295
333oh 292
555 240
333bf 161
sq1 156
666 78
333fm 77
777 68
mmagic 68
333mbf 42
444bf 31
555bf 15
333ft 5
333mbo 3

Let's visualise it in a graph:

In [17]:
# Plot a horizontal bar chart for better visualization
plt.figure(figsize=(8, 6))
sb.barplot(
    data=event_counts_no_333_df,
    y='eventId',
    x='num_competitors',
)
plt.title('Event Participation of Competitors Who Never Competed in 3x3')
plt.xlabel('Number of Competitors')
plt.ylabel('Event')
plt.tight_layout()
plt.show()
[Figure: bar chart of event participation among competitors who never competed in 3x3]

The results are unsurprising—the top three events favored by competitors who haven't participated in 3x3 are generally regarded as easier than the 3x3 cube.

Disclaimer¶

This information is based on competition results owned and maintained by the World Cube Association, published at https://worldcubeassociation.org/results as of September 16, 2025.