#pipeline architecture
from IPython.display import Image, display
display(Image('pipeline_notebook_images/pipeline_architecture.png', width=1600))

#postgres schema
display(Image('pipeline_notebook_images/postgres_schema.png', width=1200))

#mongodb schema
display(Image('pipeline_notebook_images/mongodb_schema.png', width=1600))

#summary
display(Image('pipeline_notebook_images/schema_summary.png', width=1600))

# Load the master dataset from Parquet (Lakehouse pattern: Spark → Parquet → DuckDB)
import duckdb, pandas as pd

con = duckdb.connect()
con.execute("""
    CREATE OR REPLACE VIEW games AS
    SELECT * FROM read_parquet('../storage/master_dataset.parquet/**/*.parquet')
""")
df = con.execute("SELECT * FROM games").df()
print(f"Dataset: {df.shape[0]} games x {df.shape[1]} columns (from Parquet)")
df.head()

Dataset: 1216 games x 62 columns (from Parquet)

import warnings
warnings.filterwarnings('ignore')

# con, df, and games view already created from Parquet in cell above

# ── Success tier thresholds (based on SteamSpy owners_estimate) ──
# Breakout Hit      : 1,000,000+ owners  — viral, culturally significant
# Strong Success    :   200,000–999,999  — commercially healthy, healthy community
# Moderate Success  :    20,000–199,999  — viable, recouped costs
# Underperformer    :         < 20,000   — failed to find audience
TIERS = ['Breakout Hit', 'Strong Success', 'Moderate Success', 'Underperformer']
TIER_COLORS = {
    'Breakout Hit':      '#7c3aed',
    'Strong Success':    '#2563eb',
    'Moderate Success':  '#16a34a',
    'Underperformer':    '#ef4444',
}

# ── Shorthand colour aliases & helper ─────────────────────────────────────────
C_BREAKOUT = '#7c3aed'
C_STRONG   = '#2563eb'
C_MODERATE = '#16a34a'
C_UNDER    = '#ef4444'

def clean_ax(ax, keep_left=True):
    """Strip chart chrome for the dark theme."""
    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)
    if not keep_left:
        ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_edgecolor(GRID)
    if keep_left:
        ax.spines['left'].set_edgecolor(GRID)
    ax.set_facecolor(CARD)
    ax.tick_params(colors=SUBTEXT)

print(f"Ready: {df.shape[0]} games x {df.shape[1]} columns (DuckDB → Parquet)")
print("\nTier breakdown (using owners_estimate from SteamSpy):")
for tier in TIERS:
    if tier == 'Breakout Hit':
        n = (df['owners_estimate'] >= 1_000_000).sum()
    elif tier == 'Strong Success':
        n = ((df['owners_estimate'] >= 200_000) & (df['owners_estimate'] < 1_000_000)).sum()
    elif tier == 'Moderate Success':
        n = ((df['owners_estimate'] >= 20_000) & (df['owners_estimate'] < 200_000)).sum()
    else:
        n = (df['owners_estimate'] < 20_000).sum()
    print(f"  {tier:20s}: {n:4d} games")

Ready: 1216 games x 62 columns (DuckDB → Parquet)

Tier breakdown (using owners_estimate from SteamSpy):
  Breakout Hit        :  368 games
  Strong Success      :  490 games
  Moderate Success    :  255 games
  Underperformer      :  103 games

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# ── Theme ─────────────────────────────────────────────────────────────────────
BG    = '#0D1117'
CARD  = '#161B22'
GRID  = '#21262D'
TEXT  = '#E6EDF3'
SUBTEXT = '#8B949E'

plt.rcParams.update({
    'figure.facecolor':  BG,
    'axes.facecolor':    CARD,
    'axes.edgecolor':    GRID,
    'axes.labelcolor':   TEXT,
    'axes.titlepad':     16,
    'text.color':        TEXT,
    'xtick.color':       SUBTEXT,
    'ytick.color':       SUBTEXT,
    'grid.color':        GRID,
    'grid.alpha':        0.4,
    'font.family':       'sans-serif',
    'font.size':         11,
    'axes.titlesize':    15,
    'axes.labelsize':    12,
    'figure.titlesize':  18,
    'legend.facecolor':  CARD,
    'legend.edgecolor':  GRID,
    'legend.fontsize':   10,
})

# Tier palette mapped to neon accents
tier_order  = ['Breakout Hit', 'Strong Success', 'Moderate Success', 'Underperformer']
tier_colors = ['#D2A8FF', '#58A6FF', '#7EE787', '#FF7B72']   # purple, blue, green, red

tier_counts = df['success_tier'].value_counts().reindex(tier_order)

fig, axes = plt.subplots(1, 2, figsize=(14, 5), facecolor=BG)
fig.patch.set_facecolor(BG)

# ── Left: horizontal bar chart ─────────────────────────────────────────────
bars = axes[0].barh(
    tier_order[::-1], tier_counts[tier_order[::-1]],
    color=tier_colors[::-1], edgecolor=BG, linewidth=1.5
)
for bar, count in zip(bars, tier_counts[tier_order[::-1]]):
    axes[0].text(
        bar.get_width() + 8, bar.get_y() + bar.get_height() / 2,
        f'{count:,}  ({count/1216*100:.1f}%)',
        va='center', fontsize=10, color=SUBTEXT
    )
axes[0].set_xlim(0, 1400)
axes[0].set_xlabel('Number of games', fontsize=11, color=SUBTEXT)
axes[0].set_title('Games per success tier', fontsize=13, fontweight='600', color=TEXT, pad=14)
axes[0].tick_params(axis='y', labelsize=11, colors=TEXT)
axes[0].tick_params(axis='x', colors=SUBTEXT)
axes[0].set_facecolor(CARD)
for spine in axes[0].spines.values():
    spine.set_edgecolor(GRID)
axes[0].spines['left'].set_visible(False)
axes[0].spines['top'].set_visible(False)
axes[0].spines['right'].set_visible(False)
axes[0].yaxis.set_tick_params(length=0)
axes[0].grid(True, axis='x', alpha=0.2, color=GRID)

# ── Right: ownership KDE distribution ─────────────────────────────────────
for tier, color in zip(tier_order, tier_colors):
    vals = df.loc[df.success_tier == tier, 'owners_estimate'].dropna()
    vals = vals[vals > 0]
    if len(vals) > 1:
        sns.kdeplot(np.log10(vals), ax=axes[1], color=color, fill=True,
                    alpha=0.25, linewidth=2, label=tier)

# Threshold lines
axes[1].axvline(np.log10(20_000),    color='#7EE787', ls='--', lw=1.2, alpha=0.5)
axes[1].axvline(np.log10(200_000),   color='#58A6FF', ls='--', lw=1.2, alpha=0.5)
axes[1].axvline(np.log10(1_000_000), color='#D2A8FF', ls='--', lw=1.2, alpha=0.5)

axes[1].set_xticks([3, 4, np.log10(20_000), 5, np.log10(200_000), 6, np.log10(1_000_000), 7])
axes[1].set_xticklabels(['1k', '10k', '20k', '100k', '200k', '1M', '1M', '10M'],
                         fontsize=9, color=SUBTEXT)
axes[1].set_xlabel('Estimated owners (log scale)', fontsize=11, color=SUBTEXT)
axes[1].set_yticks([])
axes[1].set_title('Ownership distribution across tiers', fontsize=13, fontweight='600',
                   color=TEXT, pad=14)
axes[1].legend(fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID, labelcolor=TEXT)
axes[1].set_facecolor(CARD)
for spine in axes[1].spines.values():
    spine.set_edgecolor(GRID)
axes[1].spines['left'].set_visible(False)
axes[1].spines['top'].set_visible(False)
axes[1].spines['right'].set_visible(False)

fig.suptitle(
    'Q1 · Four-Tier Success Landscape: 1,216 Indie Games on Steam',
    fontsize=14, fontweight='700', color=TEXT, y=1.02
)
plt.tight_layout()
plt.show()

# Q2: Price bracket vs success tier — cross-tabulation.
# For each price tier, shows the breakdown across all four success tiers,
# revealing whether price is a differentiator between Breakout Hits and Underperformers
# or whether it flattens out once you control for the middle tiers.

q2 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT price_tier,
        COUNT(*) AS total_games,
        SUM(CASE WHEN success_tier='Breakout Hit'     THEN 1 ELSE 0 END) AS breakout_hits,
        SUM(CASE WHEN success_tier='Strong Success'   THEN 1 ELSE 0 END) AS strong_successes,
        SUM(CASE WHEN success_tier='Moderate Success' THEN 1 ELSE 0 END) AS moderate_successes,
        SUM(CASE WHEN success_tier='Underperformer'   THEN 1 ELSE 0 END) AS underperformers,
        ROUND(SUM(CASE WHEN success_tier='Breakout Hit' THEN 1 ELSE 0 END)*100.0/COUNT(*),1) AS pct_breakout,
        ROUND(AVG(peak_ccu), 0)                       AS avg_peak_ccu,
        ROUND(AVG(owners_estimate), 0)                AS avg_owners
    FROM tiered WHERE price_tier IS NOT NULL
    GROUP BY price_tier ORDER BY avg_owners DESC
""").df()
display(q2)

# ── Q2 Chart: Price Tier × Success Tier breakdown (revised) ──────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────
price_order = ['free', 'under_5', '5_to_15', '15_to_30', 'premium_30+']
tier_label_map = {
    'free': 'Free', 'under_5': 'Under $5',
    '5_to_15': '$5–15', '15_to_30': '$15–30', 'premium_30+': '$30+'
}

q2_plot = q2[q2['price_tier'].isin(price_order)].copy()
q2_plot['price_tier'] = pd.Categorical(q2_plot['price_tier'], categories=price_order, ordered=True)
q2_plot = q2_plot.sort_values('price_tier')
q2_plot['label'] = q2_plot['price_tier'].map(tier_label_map)

labels    = q2_plot['label'].tolist()
totals    = q2_plot['total_games'].values
breakouts = q2_plot['breakout_hits'].values
strongs   = q2_plot['strong_successes'].values
moderates = q2_plot['moderate_successes'].values
unders    = q2_plot['underperformers'].values
pct_b     = q2_plot['pct_breakout'].values
avg_own   = q2_plot['avg_owners'].values / 1e6

pct_break  = breakouts  / totals * 100
pct_strong = strongs    / totals * 100
pct_mod    = moderates  / totals * 100
pct_under  = unders     / totals * 100

x = np.arange(len(labels))
W = 0.55

# ── Layout: title row + 3 panels + legend row ─────────────────────────────────
fig = plt.figure(figsize=(18, 7), facecolor=BG)

# Leave room at bottom for legend
gs = fig.add_gridspec(2, 3, height_ratios=[1, 0.07], hspace=0.18, wspace=0.32,
                      left=0.05, right=0.97, top=0.88, bottom=0.14)

ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[0, 2])

fig.suptitle('Q2 · Price Tier × Success Tier — Where Do Breakout Hits Come From?',
             fontsize=15, fontweight='700', color=TEXT, y=0.97)

# ── Panel 1: Stacked bar ──────────────────────────────────────────────────────
b1 = ax1.bar(x, pct_break,  width=W, color=C_BREAKOUT,  alpha=0.92, edgecolor=BG, label='Breakout Hit')
b2 = ax1.bar(x, pct_strong, width=W, bottom=pct_break,  color=C_STRONG,   alpha=0.92, edgecolor=BG, label='Strong Success')
b3 = ax1.bar(x, pct_mod,    width=W, bottom=pct_break+pct_strong,          color=C_MODERATE, alpha=0.92, edgecolor=BG, label='Moderate Success')
b4 = ax1.bar(x, pct_under,  width=W, bottom=pct_break+pct_strong+pct_mod,  color=C_UNDER,    alpha=0.92, edgecolor=BG, label='Underperformer')

# Annotate breakout % inside bar, only if segment tall enough
for i, pb in enumerate(pct_break):
    if pb > 8:
        ax1.text(i, pb / 2, f'{pb:.0f}%', ha='center', va='center',
                 fontsize=10, fontweight='bold', color=BG)

# n= totals above bars
for i, tot in enumerate(totals):
    ax1.text(i, 103, f'n={tot}', ha='center', va='bottom', fontsize=8, color=SUBTEXT)

ax1.set_xticks(x); ax1.set_xticklabels(labels, fontsize=10)
ax1.set_ylabel('% of games', color=SUBTEXT, fontsize=10)
ax1.set_title('Tier breakdown per price band', fontsize=12, color=TEXT, pad=12)
ax1.set_ylim(0, 116)
ax1.grid(True, axis='y', alpha=0.15)
clean_ax(ax1)

# ── Panel 2: Breakout rate line ───────────────────────────────────────────────
ax2.plot(x, pct_b, color=C_BREAKOUT, linewidth=3, marker='o', markersize=10,
         markerfacecolor=C_BREAKOUT, markeredgecolor=BG, markeredgewidth=2, zorder=5)
ax2.fill_between(x, pct_b, alpha=0.12, color=C_BREAKOUT)

for i, v in enumerate(pct_b):
    offset = 14 if i != len(pct_b)-1 else -20
    ax2.annotate(f'{v:.1f}%', (i, v), textcoords='offset points',
                 xytext=(0, offset), ha='center', fontsize=11,
                 fontweight='bold', color=C_BREAKOUT)

ax2.set_xticks(x); ax2.set_xticklabels(labels, fontsize=10)
ax2.set_ylabel('% Breakout Hits', color=SUBTEXT, fontsize=10)
ax2.set_title('Breakout hit rate by price', fontsize=12, color=TEXT, pad=12)
ax2.set_ylim(0, max(pct_b) * 1.55)
ax2.grid(True, axis='y', alpha=0.15)
clean_ax(ax2)

# ── Panel 3: Avg owners ───────────────────────────────────────────────────────
max_idx = np.argmax(avg_own)
bar_colors = [C_BREAKOUT if i == max_idx else '#2D333B' for i in range(len(avg_own))]
bars = ax3.bar(x, avg_own, color=bar_colors, alpha=0.9, edgecolor=BG, width=W)

for bar, val in zip(bars, avg_own):
    ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04,
             f'{val:.2f}M', ha='center', va='bottom',
             fontsize=10, fontweight='bold', color=TEXT)

ax3.set_xticks(x); ax3.set_xticklabels(labels, fontsize=10)
ax3.set_ylabel('Avg estimated owners (M)', color=SUBTEXT, fontsize=10)
ax3.set_title('Avg ownership by price tier', fontsize=12, color=TEXT, pad=12)
ax3.grid(True, axis='y', alpha=0.15)
clean_ax(ax3)

# ── Legend: single row below all panels ──────────────────────────────────────
legend_patches = [
    mpatches.Patch(color=C_BREAKOUT,  label='Breakout Hit  (≥1M owners)'),
    mpatches.Patch(color=C_STRONG,    label='Strong Success  (200k–1M)'),
    mpatches.Patch(color=C_MODERATE,  label='Moderate Success  (20k–200k)'),
    mpatches.Patch(color=C_UNDER,     label='Underperformer  (<20k)'),
]
fig.legend(handles=legend_patches, loc='lower center', ncol=4,
           fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID,
           bbox_to_anchor=(0.5, 0.01), handlelength=1.2, handleheight=1.0,
           columnspacing=2.0)

plt.show()

# Q3: Platform count (from IGDB) vs success tier.
# Breaks down the four success tiers by number of platforms available,
# allowing us to see if Breakout Hits disproportionately appear on more platforms.

q3 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT num_platforms, COUNT(*) AS num_games,
        SUM(CASE WHEN success_tier='Breakout Hit'     THEN 1 ELSE 0 END) AS breakout_hits,
        SUM(CASE WHEN success_tier='Strong Success'   THEN 1 ELSE 0 END) AS strong_successes,
        SUM(CASE WHEN success_tier='Moderate Success' THEN 1 ELSE 0 END) AS moderate_successes,
        SUM(CASE WHEN success_tier='Underperformer'   THEN 1 ELSE 0 END) AS underperformers,
        ROUND(AVG(igdb_critic_score), 1)              AS avg_critic_score,
        ROUND(AVG(owners_estimate), 0)                AS avg_owners,
        ROUND(AVG(peak_ccu), 0)                       AS avg_peak_ccu
    FROM tiered WHERE num_platforms IS NOT NULL
    GROUP BY num_platforms ORDER BY num_platforms
""").df()
display(q3)

# ── Q3 Chart: Platform Count × Success Tier (v3) ─────────────────────────────

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

BG         = '#0D1117'
CARD       = '#161B22'
GRID       = '#21262D'
TEXT       = '#E6EDF3'
SUBTEXT    = '#8B949E'
C_BREAKOUT = '#D2A8FF'
C_STRONG   = '#58A6FF'
C_MODERATE = '#7EE787'
C_UNDER    = '#FF7B72'

plt.rcParams.update({
    'figure.facecolor': BG, 'axes.facecolor': CARD,
    'axes.edgecolor': GRID, 'axes.labelcolor': TEXT,
    'text.color': TEXT, 'xtick.color': SUBTEXT, 'ytick.color': SUBTEXT,
    'grid.color': GRID, 'grid.alpha': 0.4, 'font.family': 'sans-serif',
    'font.size': 11, 'legend.facecolor': CARD, 'legend.edgecolor': GRID,
})

def clean_ax(ax):
    ax.set_facecolor(CARD)
    for sp in ['top', 'right', 'left']:
        ax.spines[sp].set_visible(False)
    ax.spines['bottom'].set_edgecolor(GRID)
    ax.tick_params(colors=SUBTEXT)

# ── Prep ──────────────────────────────────────────────────────────────────────
q3_plot = q3[(q3['num_games'] >= 10) & (q3['num_platforms'] <= 8)].copy()
q3_plot = q3_plot.sort_values('num_platforms')

plat     = q3_plot['num_platforms'].astype(int).tolist()
totals   = q3_plot['num_games'].values
avg_own  = q3_plot['avg_owners'].values / 1e6
avg_crit = q3_plot['avg_critic_score'].fillna(0).values

pct_break  = q3_plot['breakout_hits'].values    / totals * 100
pct_strong = q3_plot['strong_successes'].values / totals * 100
pct_mod    = q3_plot['moderate_successes'].values / totals * 100
pct_under  = q3_plot['underperformers'].values  / totals * 100

x     = np.arange(len(plat))
xlabs = [str(p) for p in plat]
W     = 0.6

# ── Figure ────────────────────────────────────────────────────────────────────
fig = plt.figure(figsize=(18, 7), facecolor=BG)
gs  = fig.add_gridspec(1, 2, wspace=0.35, left=0.05, right=0.95, top=0.85, bottom=0.14)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

fig.suptitle('Q3 · Platform Count vs Success — Does Being Everywhere Pay Off?',
             fontsize=15, fontweight='700', color=TEXT, y=1.04)

# ── Panel 1: Stacked % bars ───────────────────────────────────────────────────
b_under = ax1.bar(x, pct_under,  width=W, color=C_UNDER,    alpha=0.92, edgecolor=BG)
b_mod   = ax1.bar(x, pct_mod,    width=W, bottom=pct_under, color=C_MODERATE, alpha=0.92, edgecolor=BG)
b_str   = ax1.bar(x, pct_strong, width=W, bottom=pct_under+pct_mod, color=C_STRONG, alpha=0.92, edgecolor=BG)
b_brk   = ax1.bar(x, pct_break,  width=W, bottom=pct_under+pct_mod+pct_strong, color=C_BREAKOUT, alpha=0.92, edgecolor=BG)

for i, (pb, pu, pm, ps) in enumerate(zip(pct_break, pct_under, pct_mod, pct_strong)):
    base = pu + pm + ps
    if pb > 7:
        ax1.text(i, base + pb / 2, f'{pb:.0f}%',
                 ha='center', va='center', fontsize=9, fontweight='bold', color=BG)
    ax1.text(i, 102, f'n={totals[i]}', ha='center', va='bottom', fontsize=8, color=SUBTEXT)

ax1.set_xticks(x)
ax1.set_xticklabels(xlabs, fontsize=10)
ax1.set_xlabel('Number of platforms (IGDB)', color=SUBTEXT, fontsize=10)
ax1.set_ylabel('% of games', color=SUBTEXT, fontsize=10)
ax1.set_title('Success tier mix by platform count', fontsize=12, color=TEXT, pad=12)
ax1.set_ylim(0, 116)
ax1.grid(True, axis='y', alpha=0.15)
clean_ax(ax1)

# ── Panel 2: Clean line chart — platforms vs avg ownership + breakout % ───────
ax2b = ax2.twinx()

bar_cols = [C_BREAKOUT if o == avg_own.max() else '#2D333B' for o in avg_own]
ax2.bar(x, avg_own, width=W, color=bar_cols, alpha=0.75, edgecolor=BG, zorder=2)

ax2b.plot(x, pct_break, color=C_BREAKOUT, linewidth=2.5, marker='o', markersize=8,
          markerfacecolor=C_BREAKOUT, markeredgecolor=BG, markeredgewidth=2, zorder=5)
ax2b.fill_between(x, pct_break, alpha=0.08, color=C_BREAKOUT)

for i, v in enumerate(avg_own):
    ax2.text(i, v + 0.05, f'{v:.1f}M', ha='center', va='bottom',
             fontsize=9, fontweight='bold', color=TEXT)

for i, v in enumerate(pct_break):
    offset = 12 if i % 2 == 0 else -18
    ax2b.annotate(f'{v:.0f}%', (i, v), textcoords='offset points',
                  xytext=(0, offset), ha='center', fontsize=9,
                  fontweight='bold', color=C_BREAKOUT)

ax2.set_xticks(x)
ax2.set_xticklabels(xlabs, fontsize=10)
ax2.set_xlabel('Number of platforms (IGDB)', color=SUBTEXT, fontsize=10)
ax2.set_ylabel('Avg estimated owners (M)', color=SUBTEXT, fontsize=10)
ax2b.set_ylabel('Breakout hit rate (%)', color=C_BREAKOUT, fontsize=10)
ax2b.tick_params(axis='y', colors=C_BREAKOUT)
ax2b.spines['right'].set_edgecolor(C_BREAKOUT)
ax2.set_title('Avg ownership & breakout rate\nby number of platforms', fontsize=12, color=TEXT, pad=12)
ax2.grid(True, axis='y', alpha=0.15)
clean_ax(ax2)
ax2.spines['right'].set_visible(True)
ax2.spines['right'].set_edgecolor(GRID)

# ── Shared legend ─────────────────────────────────────────────────────────────
legend_patches = [
    mpatches.Patch(color=C_BREAKOUT,  label='Breakout Hit  (≥1M owners)'),
    mpatches.Patch(color=C_STRONG,    label='Strong Success  (200k–1M)'),
    mpatches.Patch(color=C_MODERATE,  label='Moderate Success  (20k–200k)'),
    mpatches.Patch(color=C_UNDER,     label='Underperformer  (<20k)'),
]
fig.legend(handles=legend_patches, loc='lower center', ncol=4,
           fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID,
           bbox_to_anchor=(0.5, 0.01), handlelength=1.2, columnspacing=2.0)

plt.show()

# Q4: VADER review sentiment vs success tier.
# Shows whether review sentiment separates the four tiers — particularly whether
# Breakout Hits attract qualitatively different language than Moderate Successes.
# avg_vader_compound is the mean VADER compound score across all English Steam reviews
# for each game, computed by the Spark pipeline UDF.

q4 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT success_tier, tier_rank,
        COUNT(*) AS num_games,
        ROUND(AVG(avg_vader_compound), 4)        AS avg_vader_score,
        ROUND(AVG(pct_positive_reviews), 1)      AS avg_pct_positive,
        ROUND(AVG(avg_playtime_at_review), 0)    AS avg_playtime_at_review_mins,
        ROUND(AVG(num_reviews_analysed), 0)      AS avg_reviews_per_game,
        ROUND(AVG(owners_estimate), 0)           AS avg_owners
    FROM tiered
    WHERE avg_vader_compound IS NOT NULL
    GROUP BY success_tier, tier_rank
    ORDER BY tier_rank
""").df()
display(q4)

# ── Q4 Chart: VADER Sentiment × Success Tier (v2) ────────────────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────

q4_plot     = q4.set_index('success_tier').reindex(tier_order).reset_index()
vader       = q4_plot['avg_vader_score'].values.astype(float)
pct_pos     = q4_plot['avg_pct_positive'].values.astype(float)
playtime    = q4_plot['avg_playtime_at_review_mins'].values.astype(float)
n_reviews   = q4_plot['avg_reviews_per_game'].values.astype(float)

x = np.arange(4)
W = 0.5
short_labels = ['Breakout', 'Strong', 'Moderate', 'Under']
bar_colors = [TIER_COLORS[t] for t in tier_order]

# ── Figure ────────────────────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 3, figsize=(17, 5.5), facecolor=BG)
fig.subplots_adjust(left=0.06, right=0.96, top=0.84, bottom=0.18, wspace=0.4)

fig.suptitle('Q4 · Review Sentiment — Do Players Feel Differently About Breakout Hits?',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Panel 1: VADER score ──────────────────────────────────────────────────────
ax = axes[0]
bars = ax.bar(x, vader, width=W, color=bar_colors, alpha=0.88, edgecolor=BG, zorder=3)
ax.axhline(0, color=SUBTEXT, linewidth=1, linestyle='--', alpha=0.4, zorder=2)

vmin = min(vader) * 0.9
vmax = max(vader) * 1.18
ax.set_ylim(vmin, vmax)

for bar, val in zip(bars, vader):
    ax.text(bar.get_x() + bar.get_width()/2, val + (vmax - vmin) * 0.02,
            f'{val:.3f}', ha='center', va='bottom',
            fontsize=11, fontweight='bold', color=TEXT)

ax.set_xticks(x)
ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg VADER compound score', fontsize=10)
ax.set_title('VADER sentiment score', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15, zorder=0)
clean_ax(ax)

# ── Panel 2: % Positive reviews ───────────────────────────────────────────────
ax = axes[1]
bars2 = ax.bar(x, pct_pos, width=W, color=bar_colors, alpha=0.88, edgecolor=BG, zorder=3)

pmin = min(pct_pos) * 0.92
pmax = max(pct_pos) * 1.12
ax.set_ylim(pmin, pmax)

for bar, val in zip(bars2, pct_pos):
    ax.text(bar.get_x() + bar.get_width()/2, val + (pmax - pmin) * 0.02,
            f'{val:.1f}%', ha='center', va='bottom',
            fontsize=11, fontweight='bold', color=TEXT)

ax.set_xticks(x)
ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg % positive reviews', fontsize=10)
ax.set_title('Steam review positivity', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15, zorder=0)
clean_ax(ax)

# ── Panel 3: Playtime at review (hrs) + review volume line ───────────────────
ax  = axes[2]
ax2 = ax.twinx()

playtime_hrs = playtime / 60
bars3 = ax.bar(x, playtime_hrs, width=W, color=bar_colors, alpha=0.75, edgecolor=BG, zorder=3)

tmin = 0
tmax = max(playtime_hrs) * 1.3
ax.set_ylim(tmin, tmax)

for bar, val in zip(bars3, playtime_hrs):
    ax.text(bar.get_x() + bar.get_width()/2, val + tmax * 0.02,
            f'{val:.0f}h', ha='center', va='bottom',
            fontsize=11, fontweight='bold', color=TEXT)

ax2.plot(x, n_reviews, color='#FFA657', linewidth=2.5, marker='o', markersize=8,
         markerfacecolor='#FFA657', markeredgecolor=BG, markeredgewidth=2, zorder=5)

for i, val in enumerate(n_reviews):
    offset = 12 if i < 2 else -18
    ax2.annotate(f'{int(val)}', (i, val), textcoords='offset points',
                 xytext=(0, offset), ha='center', fontsize=10,
                 fontweight='bold', color='#FFA657')

ax.set_xticks(x)
ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg playtime at review (hrs)', fontsize=10)
ax2.set_ylabel('Avg reviews per game', color='#FFA657', fontsize=10)
ax2.tick_params(axis='y', colors='#FFA657')
ax2.spines['right'].set_edgecolor('#FFA657')
ax2.spines['top'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax.set_title('Playtime at review & review volume', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15, zorder=0)
clean_ax(ax)
ax.spines['right'].set_visible(True)
ax.spines['right'].set_edgecolor(GRID)

# ── Shared legend ─────────────────────────────────────────────────────────────
legend_patches = [
    mpatches.Patch(color=C_BREAKOUT, label='Breakout Hit  (≥1M owners)'),
    mpatches.Patch(color=C_STRONG,   label='Strong Success  (200k–1M)'),
    mpatches.Patch(color=C_MODERATE, label='Moderate Success  (20k–200k)'),
    mpatches.Patch(color=C_UNDER,    label='Underperformer  (<20k)'),
]
fig.legend(handles=legend_patches, loc='lower center', ncol=4,
           fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID,
           bbox_to_anchor=(0.5, 0.01), handlelength=1.2, columnspacing=2.0)

plt.show()

# Q5: Dedicated subreddit presence vs success tier.
# A 4-tier breakdown shows whether the community-building effect is gradual
# (Moderate Success → Strong Success → Breakout Hit) or a binary jump.
# has_dedicated_subreddit was computed from Reddit JSON endpoint data.

q5 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT success_tier, tier_rank,
        CAST(has_dedicated_subreddit AS VARCHAR) AS has_subreddit,
        COUNT(*) AS num_games,
        ROUND(AVG(subreddit_subscribers), 0)  AS avg_subscribers,
        ROUND(AVG(total_reddit_posts), 0)     AS avg_posts,
        ROUND(AVG(reddit_engagement), 1)      AS avg_engagement,
        ROUND(AVG(owners_estimate), 0)        AS avg_owners
    FROM tiered
    GROUP BY success_tier, tier_rank, has_dedicated_subreddit
    ORDER BY tier_rank, has_dedicated_subreddit DESC
""").df()
display(q5)

# ── Q5 Chart: Subreddit Presence × Success Tier (v3) ─────────────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────

has_sub = q5[q5['has_subreddit'] == 'true'].set_index('success_tier').reindex(tier_order)
no_sub  = q5[q5['has_subreddit'] == 'false'].set_index('success_tier').reindex(tier_order)

total_per_tier = has_sub['num_games'].fillna(0).values + no_sub['num_games'].fillna(0).values
pct_has_sub    = has_sub['num_games'].fillna(0).values / total_per_tier * 100

# Subscribers: log scale to handle Breakout Hit dominance
avg_subs_raw = has_sub['avg_subscribers'].fillna(0).values
avg_posts    = has_sub['avg_posts'].fillna(0).values

x = np.arange(4)
W = 0.5
bar_colors = [TIER_COLORS[t] for t in tier_order]

# ── Figure ────────────────────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 3, figsize=(17, 5.5), facecolor=BG)
fig.subplots_adjust(left=0.06, right=0.96, top=0.84, bottom=0.18, wspace=0.4)

fig.suptitle('Q5 · Reddit Community — Does Having a Subreddit Signal Success?',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Panel 1: % with subreddit — dot + bar combo ───────────────────────────────
ax = axes[0]

# Background 100% track
ax.barh(x, [100]*4, height=W, color=GRID, alpha=0.5, edgecolor='none')
# Filled portion = has subreddit
bars = ax.barh(x, pct_has_sub, height=W, color=bar_colors, alpha=0.88, edgecolor='none')

for i, (pct, tot) in enumerate(zip(pct_has_sub, total_per_tier)):
    ax.text(pct + 2, i, f'{pct:.0f}%', va='center', fontsize=12,
            fontweight='bold', color=TEXT)
    ax.text(101, i - 0.38, f'n={int(tot)}', va='top', fontsize=8, color=SUBTEXT)

ax.set_yticks(x)
ax.set_yticklabels(short_labels, fontsize=10, color=TEXT)
ax.set_xlim(0, 125)
ax.set_xlabel('% of games with a subreddit', fontsize=10)
ax.set_title('Subreddit ownership rate\nper success tier', fontsize=12, color=TEXT, pad=10)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_edgecolor(GRID)
ax.spines['bottom'].set_edgecolor(GRID)
ax.set_facecolor(CARD)
ax.tick_params(colors=SUBTEXT)
ax.grid(False)

# ── Panel 2: Subscribers on log scale ────────────────────────────────────────
ax = axes[1]
log_subs = np.log10(np.clip(avg_subs_raw, 1, None))
bars2 = ax.bar(x, log_subs, width=W, color=bar_colors, alpha=0.88, edgecolor=BG)

lmax = max(log_subs) * 1.2
ax.set_ylim(0, lmax)

for bar, raw, lv in zip(bars2, avg_subs_raw, log_subs):
    if raw >= 1000:
        label = f'{raw/1000:.0f}k'
    else:
        label = f'{int(raw)}'
    ax.text(bar.get_x() + bar.get_width()/2, lv + lmax * 0.02,
            label, ha='center', va='bottom',
            fontsize=11, fontweight='bold', color=TEXT)

tick_vals = [3, 4, 5]
ax.set_yticks(tick_vals)
ax.set_yticklabels(['1k', '10k', '100k'], fontsize=9)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg subscribers (log scale)', fontsize=10)
ax.set_title('Avg subreddit size\n(games with subreddit only)', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)
clean_ax(ax)

# ── Panel 3: Avg Reddit posts per tier ───────────────────────────────────────
ax = axes[2]
bars3 = ax.bar(x, avg_posts, width=W, color=bar_colors, alpha=0.88, edgecolor=BG)

pmax = max(avg_posts) * 1.28
ax.set_ylim(0, pmax)

for bar, val in zip(bars3, avg_posts):
    ax.text(bar.get_x() + bar.get_width()/2, val + pmax * 0.02,
            f'{int(val)}', ha='center', va='bottom',
            fontsize=11, fontweight='bold', color=TEXT)

ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg Reddit posts', fontsize=10)
ax.set_title('Reddit post volume\n(games with subreddit only)', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)
clean_ax(ax)

# ── Shared legend ─────────────────────────────────────────────────────────────
legend_patches = [
    mpatches.Patch(color=C_BREAKOUT, label='Breakout Hit  (≥1M owners)'),
    mpatches.Patch(color=C_STRONG,   label='Strong Success  (200k–1M)'),
    mpatches.Patch(color=C_MODERATE, label='Moderate Success  (20k–200k)'),
    mpatches.Patch(color=C_UNDER,    label='Underperformer  (<20k)'),
]
fig.legend(handles=legend_patches, loc='lower center', ncol=4,
           fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID,
           bbox_to_anchor=(0.5, 0.01), handlelength=1.2, columnspacing=2.0)

plt.show()

# Q6: Google Trends pre/post-launch interest vs success tier.
# Tests whether pre-launch buzz is progressively stronger for higher tiers,
# or whether it only separates Breakout Hits from everyone else.
# google_interest_pre_launch and google_interest_post_launch are engineered
# features computed in spark_pipeline.py from 69,552 weekly search data points.

q6 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT success_tier, tier_rank,
        COUNT(*) AS num_games,
        ROUND(AVG(google_interest_pre_launch), 2)   AS avg_pre_launch_interest,
        ROUND(AVG(google_interest_post_launch), 2)  AS avg_post_launch_interest,
        ROUND(AVG(days_to_peak_interest), 0)        AS avg_days_to_peak,
        COUNT(CASE WHEN trend_direction='rising' THEN 1 END)  AS rising_trends,
        COUNT(CASE WHEN trend_direction='falling' THEN 1 END) AS falling_trends,
        ROUND(AVG(owners_estimate), 0)              AS avg_owners
    FROM tiered
    WHERE google_interest_pre_launch IS NOT NULL
    GROUP BY success_tier, tier_rank
    ORDER BY tier_rank
""").df()
display(q6)

# ── Q6 Chart: Google Trends Pre/Post Launch × Success Tier (v3) ──────────────




# ── Prep ──────────────────────────────────────────────────────────────────────

q6_plot   = q6.set_index('success_tier').reindex(tier_order).reset_index()
pre       = q6_plot['avg_pre_launch_interest'].values.astype(float)
post      = q6_plot['avg_post_launch_interest'].values.astype(float)
days_peak = q6_plot['avg_days_to_peak'].values.astype(float)
rising    = q6_plot['rising_trends'].values.astype(float)
falling   = q6_plot['falling_trends'].values.astype(float)
n_games   = q6_plot['num_games'].values.astype(float)
rising_pct  = rising  / n_games * 100
falling_pct = falling / n_games * 100

days_peak_clamped = np.clip(days_peak, 0, None)

x = np.arange(4)
W = 0.35
bar_colors = [TIER_COLORS[t] for t in tier_order]

# ── Figure ────────────────────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 3, figsize=(17, 5.5), facecolor=BG)
fig.subplots_adjust(left=0.06, right=0.96, top=0.84, bottom=0.18, wspace=0.4)

fig.suptitle('Q6 · Google Trends — Does Pre-Launch Buzz Predict Breakout Success?',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Panel 1: Pre vs Post launch interest ─────────────────────────────────────
ax = axes[0]

bars_pre  = ax.bar(x - W/2, pre,  width=W, color=bar_colors, alpha=0.88, edgecolor=BG)
bars_post = ax.bar(x + W/2, post, width=W, color=bar_colors, alpha=0.35, edgecolor=BG)

vmax = max(max(pre), max(post)) * 1.28
ax.set_ylim(0, vmax)

for i, (p1, p2) in enumerate(zip(pre, post)):
    ax.text(i - W/2, p1 + vmax*0.02, f'{p1:.1f}',
            ha='center', va='bottom', fontsize=9, fontweight='bold', color=TEXT)
    ax.text(i + W/2, p2 + vmax*0.02, f'{p2:.1f}',
            ha='center', va='bottom', fontsize=9, color=SUBTEXT)

ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Avg Google interest score', fontsize=10)
ax.set_title('Pre vs post-launch\nGoogle search interest', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)

p1 = mpatches.Patch(color='#E6EDF3', alpha=0.88, label='Pre-launch')
p2 = mpatches.Patch(color='#E6EDF3', alpha=0.35, label='Post-launch')
ax.legend(handles=[p1, p2], fontsize=9, frameon=True,
          facecolor=CARD, edgecolor=GRID, loc='upper right')
clean_ax(ax)

# ── Panel 2: Days to peak – log scale so all tiers are visible ────────────────
ax = axes[1]

days_display = np.where(days_peak_clamped > 0, days_peak_clamped, 0.5)

bars2 = ax.bar(x, days_display, width=W+0.1, color=bar_colors, alpha=0.88, edgecolor=BG)
ax.set_yscale('log')
ax.set_ylim(0.3, max(days_display) * 3)

for i, (bar, raw) in enumerate(zip(bars2, days_peak)):
    if raw < 0:
        label = 'Pre-launch\npeak'
        col = SUBTEXT
    elif raw == 0:
        label = '<1d'
        col = SUBTEXT
    else:
        label = f'{int(raw)}d'
        col = TEXT
    ax.text(bar.get_x() + bar.get_width()/2,
            bar.get_height() * 1.15,
            label, ha='center', va='bottom',
            fontsize=10, fontweight='bold', color=col)

ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9.5)
ax.set_ylabel('Days to peak interest (log scale)', fontsize=10)
ax.set_title('How quickly does search\ninterest peak post-launch?', fontsize=12, color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)
clean_ax(ax)

# ── Panel 3: Rising vs falling ───────────────────────────────────────────────
ax = axes[2]

display_order = list(reversed(range(4)))
bar_h = 0.3
spacing = 1.0
y_positions = [i * spacing for i in range(4)]

for plot_i, tier_i in enumerate(display_order):
    y     = y_positions[plot_i]
    col   = bar_colors[tier_i]
    rp    = rising_pct[tier_i]
    fp    = falling_pct[tier_i]

    ax.barh(y + bar_h/2 + 0.02, 100, height=bar_h, color=GRID, alpha=0.4, edgecolor='none')
    ax.barh(y + bar_h/2 + 0.02, rp,  height=bar_h, color=col,  alpha=0.9, edgecolor='none')
    ax.text(rp + 2, y + bar_h/2 + 0.02, f'{rp:.0f}% rising',
            va='center', fontsize=9, fontweight='bold', color=TEXT)

    ax.barh(y - bar_h/2 - 0.02, 100, height=bar_h, color=GRID, alpha=0.4, edgecolor='none')
    ax.barh(y - bar_h/2 - 0.02, fp,  height=bar_h, color=col,  alpha=0.3, edgecolor='none')
    ax.text(fp + 2, y - bar_h/2 - 0.02, f'{fp:.0f}% falling',
            va='center', fontsize=9, color=SUBTEXT)

ax.set_yticks(y_positions)
ax.set_yticklabels([short_labels[i] for i in display_order], fontsize=9.5, color=TEXT)
ax.set_xlim(0, 140)
ax.set_xlabel('% of games in tier', fontsize=10)
ax.set_title('Trend direction after launch\n(rising vs falling interest)', fontsize=12, color=TEXT, pad=10)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_edgecolor(GRID)
ax.spines['bottom'].set_edgecolor(GRID)
ax.set_facecolor(CARD)
ax.tick_params(colors=SUBTEXT)
ax.grid(False)

# ── Shared legend ─────────────────────────────────────────────────────────────
legend_patches = [
    mpatches.Patch(color=C_BREAKOUT, label='Breakout Hit  (≥1M owners)'),
    mpatches.Patch(color=C_STRONG,   label='Strong Success  (200k–1M)'),
    mpatches.Patch(color=C_MODERATE, label='Moderate Success  (20k–200k)'),
    mpatches.Patch(color=C_UNDER,    label='Underperformer  (<20k)'),
]
fig.legend(handles=legend_patches, loc='lower center', ncol=4,
           fontsize=10, frameon=True, facecolor=CARD, edgecolor=GRID,
           bbox_to_anchor=(0.5, 0.01), handlelength=1.2, columnspacing=2.0)

plt.show()

# Q7: Top 20 breakout hits ranked by peak concurrent users.
# Restricted to games in the Breakout Hit tier (1M+ owners) so the ranking
# is within the elite tier rather than conflating tiers. Includes all key signals.

q7 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT name, success_tier, peak_ccu,
        ROUND(owners_estimate, 0)           AS owners_estimate,
        price,
        ROUND(review_positivity_ratio, 3)   AS review_ratio,
        ROUND(avg_vader_compound, 3)        AS vader_score,
        num_platforms,
        COALESCE(subreddit_subscribers, 0)  AS reddit_subs,
        ROUND(trend_peak, 0)               AS trend_peak
    FROM tiered
    WHERE success_tier = 'Breakout Hit'
    ORDER BY peak_ccu DESC LIMIT 20
""").df()
display(q7)

# ── Q7 Chart: Top 20 Breakout Hits by Peak CCU ──────────────────────────────

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

BG         = '#0D1117'
CARD       = '#161B22'
GRID       = '#21262D'
TEXT       = '#E6EDF3'
SUBTEXT    = '#8B949E'
C_BREAKOUT = '#D2A8FF'

plt.rcParams.update({
    'figure.facecolor': BG, 'axes.facecolor': CARD,
    'axes.edgecolor': GRID, 'axes.labelcolor': SUBTEXT,
    'text.color': TEXT, 'xtick.color': SUBTEXT, 'ytick.color': SUBTEXT,
    'grid.color': GRID, 'grid.alpha': 0.4, 'font.family': 'sans-serif',
    'font.size': 11, 'legend.facecolor': CARD, 'legend.edgecolor': GRID,
})

def clean_ax(ax, keep_left=True):
    ax.set_facecolor(CARD)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(keep_left)
    ax.spines['bottom'].set_edgecolor(GRID)
    if keep_left: ax.spines['left'].set_edgecolor(GRID)
    ax.tick_params(colors=SUBTEXT)

q7_plot = q7.sort_values('peak_ccu', ascending=True).tail(20)

fig, ax = plt.subplots(figsize=(12, 8), facecolor=BG)
clean_ax(ax, keep_left=False)

bars = ax.barh(
    range(len(q7_plot)), q7_plot['peak_ccu'].values,
    color=C_BREAKOUT, edgecolor=BG, linewidth=0.8, alpha=0.85
)

for i, (_, row) in enumerate(q7_plot.iterrows()):
    name = row['name'][:30] + '…' if len(str(row['name'])) > 30 else row['name']
    ccu = row['peak_ccu']
    ax.text(ccu + max(q7_plot['peak_ccu']) * 0.01, i, f'{name}  ({ccu:,.0f})',
            va='center', fontsize=9, color=TEXT, fontweight='500')

ax.set_yticks([])
ax.set_xlabel('Peak Concurrent Users', fontsize=11, color=SUBTEXT)
ax.set_xlim(0, max(q7_plot['peak_ccu']) * 1.45)
ax.grid(True, axis='x', alpha=0.15, color=GRID)

fig.suptitle('Q7 · Top 20 Breakout Hits Ranked by Peak Concurrent Users',
             fontsize=14, fontweight='700', color=TEXT, y=0.98)
plt.tight_layout()
plt.show()

# Q8: Deep profile of Underperformers (<20k owners) grouped by price tier.
# Previously only showed 1 row due to NULL price_tier — COALESCE maps NULLs to 'unknown'.
# Now also includes avg_owners within the underperformer tier and comparison
# against the Moderate Success tier average for context.

q8 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT COALESCE(price_tier, 'unknown') AS price_tier,
        COUNT(*) AS underperformer_games,
        ROUND(AVG(owners_estimate), 0)          AS avg_owners,
        ROUND(AVG(price), 2)                    AS avg_price,
        ROUND(AVG(peak_ccu), 0)                 AS avg_peak_ccu,
        ROUND(AVG(review_positivity_ratio), 3)  AS avg_review_ratio,
        ROUND(AVG(avg_vader_compound), 4)       AS avg_vader,
        ROUND(AVG(num_platforms), 1)            AS avg_platforms,
        ROUND(AVG(total_reddit_posts), 0)       AS avg_reddit_posts,
        SUM(CASE WHEN has_dedicated_subreddit THEN 1 ELSE 0 END) AS with_subreddit
    FROM tiered WHERE success_tier = 'Underperformer'
    GROUP BY price_tier ORDER BY underperformer_games DESC
""").df()
display(q8)

# ── Q8 Chart: What Do Underperformers Look Like? ────────────────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────
price_display = {
    'free': 'Free', 'under_5': 'Under $5', '5_to_15': '$5–15',
    '15_to_30': '$15–30', 'premium_30+': '$30+', 'unknown': 'Unknown'
}
q8_plot = q8.copy()
q8_plot['label'] = q8_plot['price_tier'].map(price_display).fillna(q8_plot['price_tier'])
q8_plot['subreddit_pct'] = q8_plot['with_subreddit'] / q8_plot['underperformer_games'] * 100

labels = q8_plot['label'].tolist()
x = np.arange(len(labels))
w = 0.35

fig, axes = plt.subplots(1, 3, figsize=(17, 5.5), facecolor=BG)
fig.subplots_adjust(left=0.06, right=0.96, top=0.84, bottom=0.18, wspace=0.35)

fig.suptitle('Q8 · Anatomy of an Underperformer: Where the <20k-Owner Games Cluster',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Panel 1: Distribution across price tiers ─────────────────────────────────
ax = axes[0]
clean_ax(ax)
bars = ax.bar(x, q8_plot['underperformer_games'], color=C_UNDER, edgecolor=BG, alpha=0.85)
for b in bars:
    n = int(b.get_height())
    pct = n / q8_plot['underperformer_games'].sum() * 100
    ax.text(b.get_x() + b.get_width()/2, b.get_height() + 1,
            f'{n}  ({pct:.0f}%)', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=9)
ax.set_title('Where underperformers price themselves', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.set_ylabel('Games', fontsize=10, color=SUBTEXT)
ax.grid(True, axis='y', alpha=0.15)

# ── Panel 2: Review ratio vs VADER ───────────────────────────────────────────
ax = axes[1]
clean_ax(ax)
ax.bar(x - w/2, q8_plot['avg_review_ratio'], w, color=C_STRONG, alpha=0.85, label='Review ratio')
ax.bar(x + w/2, q8_plot['avg_vader'].fillna(0), w, color=C_MODERATE, alpha=0.85, label='VADER sentiment')

for i, (rr, vd) in enumerate(zip(q8_plot['avg_review_ratio'], q8_plot['avg_vader'].fillna(0))):
    ax.text(i - w/2, rr + 0.01, f'{rr:.2f}', ha='center', fontsize=8, color=TEXT)
    ax.text(i + w/2, vd + 0.01, f'{vd:.2f}', ha='center', fontsize=8, color=TEXT)

ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=9)
ax.set_title('Quality isn\'t the problem', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.set_ylabel('Score', fontsize=10, color=SUBTEXT)
ax.set_ylim(0, 1.05)
ax.legend(fontsize=9, labelcolor=TEXT)
ax.grid(True, axis='y', alpha=0.15)

# ── Panel 3: Community presence — subreddit % with Reddit posts annotated ────
ax = axes[2]
clean_ax(ax)
bars = ax.bar(x, q8_plot['subreddit_pct'], width=0.6, color='#FFA657', edgecolor=BG, alpha=0.85)

for i, (b, rp) in enumerate(zip(bars, q8_plot['avg_reddit_posts'].fillna(0))):
    pct = q8_plot['subreddit_pct'].iloc[i]
    # % label above bar
    ax.text(b.get_x() + b.get_width()/2, pct + 2,
            f'{pct:.0f}%', ha='center', fontsize=10, fontweight='bold', color=TEXT)
    # Reddit posts below the x-axis label
    ax.text(b.get_x() + b.get_width()/2, -9,
            f'~{int(rp)} posts', ha='center', fontsize=8, color=SUBTEXT)

ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=9)
ax.set_title('Visibility is the problem', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.set_ylabel('% with dedicated subreddit', fontsize=10, color=SUBTEXT)
ax.set_ylim(-15, 115)
ax.grid(True, axis='y', alpha=0.15)

plt.show()

# Q9: Side-by-side comparison of all key features across the four success tiers.
# This is the primary evidence table — replaces the binary Successful/Failed comparison
# with four-way granularity to reveal whether signals are gradual or threshold effects.

q9 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT success_tier, tier_rank,
        COUNT(*) AS n,
        ROUND(AVG(owners_estimate), 0)          AS avg_owners,
        ROUND(AVG(price), 2)                    AS avg_price,
        ROUND(AVG(peak_ccu), 0)                 AS avg_ccu,
        ROUND(AVG(review_positivity_ratio), 3)  AS review_ratio,
        ROUND(AVG(avg_vader_compound), 4)       AS vader,
        ROUND(AVG(num_platforms), 1)            AS platforms,
        ROUND(AVG(total_reddit_posts), 0)       AS reddit_posts,
        ROUND(AVG(subreddit_subscribers), 0)    AS reddit_subs,
        ROUND(AVG(trend_peak), 1)               AS trend_peak,
        ROUND(AVG(steam_platform_count), 1)     AS steam_platforms,
        ROUND(AVG(playtime_retention_ratio), 3) AS retention,
        ROUND(AVG(achievements), 0)             AS achievements
    FROM tiered
    GROUP BY success_tier, tier_rank
    ORDER BY tier_rank
""").df()
display(q9)

# ── Q9 Chart: What Actually Separates the Tiers? ────────────────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────
q9_plot = q9.set_index('success_tier').reindex(tier_order)
x = np.arange(4)
bar_colors = [TIER_COLORS[t] for t in tier_order]

fig, axes = plt.subplots(2, 3, figsize=(17, 10), facecolor=BG)
fig.suptitle('Q9 · What Actually Separates the Success Tiers?',
             fontsize=15, fontweight='700', color=TEXT, y=0.98)

# ── Row 1: Metrics that VARY (the differentiators) ───────────────────────────
fig.text(0.03, 0.75, 'DIFFERENTIATORS', fontsize=11, fontweight='700',
         color='#58A6FF', rotation=90, va='center', ha='center')

# Peak CCU
ax = axes[0, 0]
clean_ax(ax)
vals = q9_plot['avg_ccu'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + max(vals)*0.03, f'{v:,.0f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('Peak concurrent users', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)

# Reddit Subs
ax = axes[0, 1]
clean_ax(ax)
vals = q9_plot['reddit_subs'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + max(vals)*0.03, f'{v:,.0f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('Avg subreddit subscribers', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)

# Trend Peak
ax = axes[0, 2]
clean_ax(ax)
vals = q9_plot['trend_peak'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + max(vals)*0.03, f'{v:.1f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('Google Trends peak interest', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)

# ── Row 2: Metrics that DON'T vary (quality is flat) ─────────────────────────
fig.text(0.03, 0.28, 'QUALITY\n(FLAT)', fontsize=11, fontweight='700',
         color='#8B949E', rotation=90, va='center', ha='center')

# Review Ratio
ax = axes[1, 0]
clean_ax(ax)
vals = q9_plot['review_ratio'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + 0.01, f'{v:.3f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('Review positivity ratio', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.set_ylim(0, 1.1)
ax.grid(True, axis='y', alpha=0.15)

# VADER
ax = axes[1, 1]
clean_ax(ax)
vals = q9_plot['vader'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + 0.01, f'{v:.3f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('VADER sentiment score', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.set_ylim(0, 1.1)
ax.grid(True, axis='y', alpha=0.15)

# Retention
ax = axes[1, 2]
clean_ax(ax)
vals = q9_plot['retention'].values
ax.bar(x, vals, color=bar_colors, edgecolor=BG, alpha=0.85, width=0.55)
for i, v in enumerate(vals):
    ax.text(i, v + max(vals)*0.03, f'{v:.3f}', ha='center', fontsize=9, color=TEXT)
ax.set_xticks(x); ax.set_xticklabels(short_labels, fontsize=9)
ax.set_title('Playtime retention ratio', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax.grid(True, axis='y', alpha=0.15)

plt.tight_layout(rect=[0.05, 0, 1, 0.95])
plt.show()

# Q10: Playtime retention metrics across success tiers.
# playtime_retention_ratio = median_playtime / average_playtime (closer to 1 = even retention).
# FIX retained: outlier filter < 50000 mins removes Desktop Kitten Girl anomaly.
# Four-tier version reveals whether Breakout Hits have qualitatively different
# retention curves or just more of the same.

q10 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT success_tier, tier_rank,
        COUNT(*) AS num_games,
        ROUND(AVG(average_playtime_forever), 0)   AS avg_playtime_mins,
        ROUND(AVG(median_playtime_forever), 0)    AS median_playtime_mins,
        ROUND(AVG(playtime_retention_ratio), 3)   AS avg_retention_ratio,
        ROUND(AVG(steamspy_avg_playtime), 0)      AS steamspy_avg_playtime,
        ROUND(AVG(steamspy_median_playtime), 0)   AS steamspy_median_playtime,
        ROUND(AVG(owners_estimate), 0)            AS avg_owners
    FROM tiered
    WHERE playtime_retention_ratio IS NOT NULL
        AND average_playtime_forever < 50000
    GROUP BY success_tier, tier_rank
    ORDER BY tier_rank
""").df()
display(q10)

# ── Q10 Chart: Playtime Retention Across Success Tiers ──────────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────

q10_plot = q10.set_index('success_tier').reindex(tier_order).reset_index()
x = np.arange(4)
bar_colors = [TIER_COLORS[t] for t in tier_order]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5.5), facecolor=BG,
                                gridspec_kw={'width_ratios': [3, 2]})
fig.subplots_adjust(left=0.06, right=0.96, top=0.84, bottom=0.15, wspace=0.3)

fig.suptitle('Q10 · Do Successful Games Keep Players Longer?',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Left: Average vs Median playtime side by side ─────────────────────────────
clean_ax(ax1)
w = 0.3
avg_vals = q10_plot['avg_playtime_mins'].values
med_vals = q10_plot['median_playtime_mins'].values

bars_avg = ax1.bar(x - w/2, avg_vals, w, color=bar_colors, alpha=0.85, edgecolor=BG)
bars_med = ax1.bar(x + w/2, med_vals, w, color=bar_colors, alpha=0.4, edgecolor=BG)

for i in range(4):
    ax1.text(i - w/2, avg_vals[i] + max(avg_vals)*0.02,
             f'{int(avg_vals[i]):,}', ha='center', fontsize=9, fontweight='bold', color=TEXT)
    ax1.text(i + w/2, med_vals[i] + max(avg_vals)*0.02,
             f'{int(med_vals[i]):,}', ha='center', fontsize=9, color=SUBTEXT)

ax1.set_xticks(x); ax1.set_xticklabels(short_labels, fontsize=10)
ax1.set_ylabel('Minutes', fontsize=10, color=SUBTEXT)
ax1.set_title('Average vs Median playtime (minutes)', fontsize=12, fontweight='600', color=TEXT, pad=10)
ax1.grid(True, axis='y', alpha=0.15)

p1 = mpatches.Patch(color=TEXT, alpha=0.85, label='Average (solid)')
p2 = mpatches.Patch(color=TEXT, alpha=0.4, label='Median (faded)')
ax1.legend(handles=[p1, p2], fontsize=9, labelcolor=TEXT, frameon=True,
           facecolor=CARD, edgecolor=GRID)

# ── Right: Retention ratio as lollipop ────────────────────────────────────────
clean_ax(ax2)
retention = q10_plot['avg_retention_ratio'].values

for i, (r, color) in enumerate(zip(retention, bar_colors)):
    ax2.plot([i, i], [0, r], color=color, lw=4, alpha=0.6)
    ax2.scatter(i, r, color=color, s=150, zorder=5, edgecolor=BG, linewidth=2)
    ax2.text(i, r + 0.025, f'{r:.3f}', ha='center', fontsize=11,
             fontweight='bold', color=TEXT)

ax2.axhline(1.0, color=SUBTEXT, ls='--', lw=1, alpha=0.3, label='Perfect retention (1.0)')
ax2.set_xticks(range(4))
ax2.set_xticklabels(short_labels, fontsize=10)
ax2.set_ylabel('Median / Average playtime', fontsize=10, color=SUBTEXT)
ax2.set_ylim(0, max(retention) * 1.3)
ax2.set_title('Retention ratio\n(closer to 1.0 = more even engagement)', fontsize=12,
              fontweight='600', color=TEXT, pad=10)
ax2.legend(fontsize=9, labelcolor=SUBTEXT, frameon=True, facecolor=CARD, edgecolor=GRID)
ax2.grid(True, axis='y', alpha=0.15)

plt.show()

# Q11: Identifies which Steam tags (genres/themes) correlate with success tiers.
# Tags are stored in game_tags (one row per game × tag). Exported from PostgreSQL
# to CSV so the notebook is self-contained without Docker.

game_tags = pd.read_csv('../storage/game_tags.csv')
con.execute("CREATE OR REPLACE TABLE game_tags AS SELECT * FROM game_tags")

q11_top = con.execute("""
    SELECT t.tag_name, COUNT(*) AS num_games,
        SUM(CASE WHEN g.owners_estimate >= 1000000 THEN 1 ELSE 0 END) AS breakout_hits,
        SUM(CASE WHEN g.owners_estimate >= 200000 AND g.owners_estimate < 1000000 THEN 1 ELSE 0 END) AS strong_successes,
        SUM(CASE WHEN g.owners_estimate >= 20000  AND g.owners_estimate < 200000  THEN 1 ELSE 0 END) AS moderate_successes,
        SUM(CASE WHEN g.owners_estimate < 20000 THEN 1 ELSE 0 END) AS underperformers,
        ROUND(SUM(CASE WHEN g.owners_estimate >= 200000 THEN 1 ELSE 0 END)*100.0/COUNT(*),1) AS pct_strong_or_above,
        ROUND(AVG(g.peak_ccu),0) AS avg_peak_ccu
    FROM game_tags t JOIN games g ON t.app_id = g.app_id
    GROUP BY t.tag_name HAVING COUNT(*) >= 20
    ORDER BY pct_strong_or_above DESC LIMIT 20
""").df()

q11_bot = con.execute("""
    SELECT t.tag_name, COUNT(*) AS num_games,
        SUM(CASE WHEN g.owners_estimate < 20000 THEN 1 ELSE 0 END) AS underperformers,
        ROUND(SUM(CASE WHEN g.owners_estimate < 20000 THEN 1 ELSE 0 END)*100.0/COUNT(*),1) AS pct_underperformer
    FROM game_tags t JOIN games g ON t.app_id = g.app_id
    GROUP BY t.tag_name HAVING COUNT(*) >= 20
    ORDER BY pct_underperformer DESC LIMIT 10
""").df()

print("Top 20 tags by % reaching Strong Success or above:")
display(q11_top)
print("\nBottom 10 tags (highest underperformer %):")
display(q11_bot)

Top 20 tags by % reaching Strong Success or above:

Bottom 10 tags (highest underperformer %):

# ── Q11 Chart: Steam Tags Most Correlated with Success (Option 1) ────────────

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7), facecolor=BG)
fig.subplots_adjust(wspace=0.45, left=0.12, right=0.95, top=0.88, bottom=0.08)

fig.suptitle('Q11 · Which Steam Tags Predict (or Doom) Indie Success?',
             fontsize=14, fontweight='700', color=TEXT, y=0.97)

# ── Left panel: Top 10 tags by success ────────────────────────────────────────
top10 = q11_top.sort_values('pct_strong_or_above', ascending=True).tail(10)
pcts_t = top10['pct_strong_or_above'].values
tags_t = top10['tag_name'].values
n_t    = top10['num_games'].values

clean_ax(ax1, keep_left=False)
bars1 = ax1.barh(range(len(tags_t)), pcts_t, color=C_BREAKOUT, edgecolor=BG,
                 alpha=0.85, height=0.7)
for i, (pct, n) in enumerate(zip(pcts_t, n_t)):
    ax1.text(pct + 0.8, i, f'{pct:.1f}%  (n={int(n)})',
             va='center', fontsize=9, color=TEXT)
ax1.set_yticks(range(len(tags_t)))
ax1.set_yticklabels(tags_t, fontsize=10, color=TEXT)
ax1.set_xlabel('% reaching Strong Success or above (≥200k owners)', fontsize=9, color=SUBTEXT)
ax1.set_xlim(0, max(pcts_t) * 1.25)
ax1.set_title('Top 10 Tags — Success Magnets', fontsize=12, color=C_BREAKOUT, pad=12)
ax1.grid(True, axis='x', alpha=0.15, color=GRID)

# ── Right panel: Bottom 10 tags by underperformer rate ────────────────────────
bot10 = q11_bot.sort_values('pct_underperformer', ascending=True).tail(10)
pcts_b = bot10['pct_underperformer'].values
tags_b = bot10['tag_name'].values
n_b    = bot10['num_games'].values

clean_ax(ax2, keep_left=False)
bars2 = ax2.barh(range(len(tags_b)), pcts_b, color=C_UNDER, edgecolor=BG,
                 alpha=0.85, height=0.7)
for i, (pct, n) in enumerate(zip(pcts_b, n_b)):
    ax2.text(pct + 0.8, i, f'{pct:.1f}%  (n={int(n)})',
             va='center', fontsize=9, color=TEXT)
ax2.set_yticks(range(len(tags_b)))
ax2.set_yticklabels(tags_b, fontsize=10, color=TEXT)
ax2.set_xlabel('% classified as Underperformer (<20k owners)', fontsize=9, color=SUBTEXT)
ax2.set_xlim(0, max(pcts_b) * 1.25)
ax2.set_title('Bottom 10 Tags — Red Flags', fontsize=12, color=C_UNDER, pad=12)
ax2.grid(True, axis='x', alpha=0.15, color=GRID)

plt.show()

# Q12: Success tier composition by release year (2010–2025).
# Instead of a single success rate, shows how each year's cohort distributes
# across the four tiers — revealing whether the market has become harder to
# break out in as game counts increased (the "Discoverability Crisis").

q12 = con.execute("""
    WITH tiered AS (
    SELECT *,
        CASE
            WHEN owners_estimate >= 1000000 THEN 'Breakout Hit'
            WHEN owners_estimate >= 200000  THEN 'Strong Success'
            WHEN owners_estimate >= 20000   THEN 'Moderate Success'
            ELSE                                 'Underperformer'
        END AS success_tier,
        CASE
            WHEN owners_estimate >= 1000000 THEN 1
            WHEN owners_estimate >= 200000  THEN 2
            WHEN owners_estimate >= 20000   THEN 3
            ELSE                                 4
        END AS tier_rank
    FROM games
)
    SELECT TRY_CAST(SPLIT_PART(release_date, ', ', 2) AS INTEGER) AS release_year,
        COUNT(*) AS num_games,
        SUM(CASE WHEN success_tier='Breakout Hit'     THEN 1 ELSE 0 END) AS breakout_hits,
        SUM(CASE WHEN success_tier='Strong Success'   THEN 1 ELSE 0 END) AS strong_successes,
        SUM(CASE WHEN success_tier='Moderate Success' THEN 1 ELSE 0 END) AS moderate_successes,
        SUM(CASE WHEN success_tier='Underperformer'   THEN 1 ELSE 0 END) AS underperformers,
        ROUND(SUM(CASE WHEN success_tier='Breakout Hit'    THEN 1 ELSE 0 END)*100.0/COUNT(*),1) AS pct_breakout,
        ROUND(SUM(CASE WHEN success_tier='Underperformer'  THEN 1 ELSE 0 END)*100.0/COUNT(*),1) AS pct_underperformer,
        ROUND(AVG(peak_ccu), 0)          AS avg_peak_ccu,
        ROUND(AVG(owners_estimate), 0)   AS avg_owners
    FROM tiered
    WHERE TRY_CAST(SPLIT_PART(release_date, ', ', 2) AS INTEGER) BETWEEN 2010 AND 2025
    GROUP BY release_year ORDER BY release_year
""").df()
display(q12)

# ── Q12 Chart: Success Tier Composition by Release Year (v2) ─────────────────




# ── Prep ──────────────────────────────────────────────────────────────────────
q12_plot = q12.dropna(subset=['release_year']).copy()
q12_plot = q12_plot[q12_plot['release_year'].between(2010, 2025)]
q12_plot = q12_plot.sort_values('release_year')

years   = q12_plot['release_year'].astype(int).values
totals  = q12_plot['num_games'].values
pct_b   = q12_plot['breakout_hits'].values / totals * 100
pct_s   = q12_plot['strong_successes'].values / totals * 100
pct_m   = q12_plot['moderate_successes'].values / totals * 100
pct_u   = q12_plot['underperformers'].values / totals * 100

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 9), facecolor=BG,
                                gridspec_kw={'height_ratios': [3, 1]})
fig.subplots_adjust(hspace=0.25)

fig.suptitle('Q12 · The Discoverability Crisis: Has It Gotten Harder to Break Out?',
             fontsize=14, fontweight='700', color=TEXT, y=0.98)

# ── Top: stacked bar chart (%) ────────────────────────────────────────────────
clean_ax(ax1)
w = 0.7

ax1.bar(years, pct_b, w, color=C_BREAKOUT, alpha=0.85, edgecolor=BG, label='Breakout Hit')
ax1.bar(years, pct_s, w, bottom=pct_b, color=C_STRONG, alpha=0.85, edgecolor=BG, label='Strong Success')
ax1.bar(years, pct_m, w, bottom=pct_b+pct_s, color=C_MODERATE, alpha=0.85, edgecolor=BG, label='Moderate Success')
ax1.bar(years, pct_u, w, bottom=pct_b+pct_s+pct_m, color=C_UNDER, alpha=0.85, edgecolor=BG, label='Underperformer')

# Label breakout % on each bar (the key metric to watch shrink)
for i, (yr, pb) in enumerate(zip(years, pct_b)):
    if pb >= 5:
        ax1.text(yr, pb / 2, f'{pb:.0f}%', ha='center', va='center',
                 fontsize=8, fontweight='bold', color=BG)

ax1.set_ylabel('% of cohort', fontsize=11, color=SUBTEXT)
ax1.set_ylim(0, 100)
ax1.set_title('What proportion of each year\'s games reached each tier?',
              fontsize=12, fontweight='600', color=TEXT, pad=10)
ax1.legend(fontsize=9, labelcolor=TEXT, ncol=4, loc='upper center',
           bbox_to_anchor=(0.5, 1.15), frameon=True, facecolor=CARD, edgecolor=GRID)
ax1.grid(True, axis='y', alpha=0.15, color=GRID)
ax1.set_xticks(years)
ax1.set_xticklabels(years, rotation=45, fontsize=9)

# ── Bottom: absolute game count ───────────────────────────────────────────────
clean_ax(ax2)
ax2.bar(years, totals, color=SUBTEXT, alpha=0.5, edgecolor=BG, width=0.7)
for yr, n in zip(years, totals):
    if n >= 5:
        ax2.text(yr, n + max(totals)*0.03, str(int(n)),
                 ha='center', fontsize=8, color=SUBTEXT)
ax2.set_xlabel('Release year', fontsize=11, color=SUBTEXT)
ax2.set_ylabel('Games in\ndataset', fontsize=10, color=SUBTEXT)
ax2.set_xticks(years)
ax2.set_xticklabels(years, rotation=45, fontsize=9)
ax2.grid(True, axis='y', alpha=0.15, color=GRID)

plt.show()

from IPython.display import display, Markdown

with open('../run_pipeline.sh', 'r') as f:
    content = f.read()

display(Markdown("""
### `run_pipeline.sh` — End-to-End Pipeline Orchestration

The script automates the full pipeline in 6 steps:

| Step | Action | What it does |
|------|--------|-------------|
| 0 | Prerequisite checks | Validates Docker, Python, Java, `.env`, and data files are present |
| 1 | Start containers | `docker compose up -d` (PostgreSQL + MongoDB) |
| 2 | Wait for readiness | Polls both databases for up to 30s each |
| 3 | Load data | Runs `load_postgres.py` and `load_mongodb.py` |
| 4 | Spark pipeline | Joins, VADER sentiment, feature engineering, Parquet export |
| 5 | DuckDB queries | 12 analytical queries on Parquet files (Lakehouse pattern) |
| 6 | Data lineage | Logs all 22 pipeline steps to PostgreSQL |

**Reproducibility:** Any team member can clone the repo, run `./run_pipeline.sh`, and get identical results — Docker ensures consistent database environments, and Git tracks all code changes.
Raw Pipeline:
"""))

display(Markdown(f'```bash\n{content}\n```'))

#!/bin/bash
# =============================================================================
# run_pipeline.sh — Indie Game Survival Guide
# =============================================================================
# Orchestrates the full data engineering pipeline end to end.
# Run from the project root: ./run_pipeline.sh
#
# Prerequisites:
#   - Docker Desktop installed and running
#   - Python 3.x with packages from requirements.txt installed
#   - Java 17 installed (for PySpark)
#   - .env file with database credentials
#
# What this script does:
#   1. Starts Docker containers (PostgreSQL + MongoDB)
#   2. Waits for databases to be ready
#   3. Loads data into PostgreSQL and MongoDB
#   4. Runs Spark pipeline (joins, sentiment, features, Parquet output)
#   5. Runs DuckDB analytical queries on Parquet files
#   6. Logs data lineage into PostgreSQL
#   7. Prints summary
# =============================================================================

set -e  # Exit on any error

echo "============================================================"
echo "  Indie Game Pipeline — Full Automation Script"
echo "============================================================"
echo ""

# ── Step 0: Check prerequisites ──────────────────────────────────────────────

echo "[0/6] Checking prerequisites..."

# Check Docker
if ! docker info > /dev/null 2>&1; then
    echo "  ERROR: Docker is not running. Please start Docker Desktop."
    exit 1
fi
echo "  ✓ Docker running"

# Check Python
if ! python3 --version > /dev/null 2>&1; then
    echo "  ERROR: Python3 not found."
    exit 1
fi
echo "  ✓ Python3 found: $(python3 --version)"

# Check Java
if ! java -version > /dev/null 2>&1; then
    echo "  ERROR: Java not found. Install with: brew install openjdk@17"
    exit 1
fi
echo "  ✓ Java found"

# Check .env
if [ ! -f .env ]; then
    echo "  ERROR: .env file not found. Create it with database credentials."
    echo "  See DOCKER_SETUP.md for instructions."
    exit 1
fi
echo "  ✓ .env file found"

# Check data files
if [ ! -f data_collection/indie_games_filtered.json ]; then
    echo "  ERROR: data_collection/indie_games_filtered.json not found."
    echo "  Run the collection scripts first or pull data from the repo."
    exit 1
fi
echo "  ✓ Data files present"
echo ""

# ── Step 1: Start Docker containers ─────────────────────────────────────────

echo "[1/6] Starting Docker containers..."
docker compose up -d
echo "  ✓ Containers started"
echo ""

# ── Step 2: Wait for databases to be ready ───────────────────────────────────

echo "[2/6] Waiting for databases to be ready..."

# Wait for PostgreSQL
echo "  Waiting for PostgreSQL..."
for i in $(seq 1 30); do
    if docker exec indie_postgres pg_isready -U indie -d indie_games > /dev/null 2>&1; then
        echo "  ✓ PostgreSQL ready"
        break
    fi
    if [ $i -eq 30 ]; then
        echo "  ERROR: PostgreSQL did not start within 30 seconds"
        exit 1
    fi
    sleep 1
done

# Wait for MongoDB
echo "  Waiting for MongoDB..."
for i in $(seq 1 30); do
    if docker exec indie_mongo mongosh --quiet -u indie -p pipeline2026 --authenticationDatabase admin --eval "db.runCommand({ping:1})" > /dev/null 2>&1; then
        echo "  ✓ MongoDB ready"
        break
    fi
    if [ $i -eq 30 ]; then
        echo "  ERROR: MongoDB did not start within 30 seconds"
        exit 1
    fi
    sleep 1
done
echo ""

# ── Step 3: Load data into databases ────────────────────────────────────────

echo "[3/6] Loading data into databases..."
echo ""
echo "--- PostgreSQL ---"
python3 databases/load_postgres.py
echo ""
echo "--- MongoDB ---"
python3 databases/load_mongodb.py
echo ""

# ── Step 4: Run Spark pipeline ───────────────────────────────────────────────

echo "[4/6] Running Spark pipeline..."
echo ""
python3 spark_pipeline/spark_pipeline.py
echo ""

# ── Step 5: Run DuckDB queries ───────────────────────────────────────────────

echo "[5/6] Running DuckDB analytical queries..."
echo ""
python3 duckdb_queries.py
echo ""

# ── Step 6: Log data lineage ────────────────────────────────────────────────

echo "[6/6] Logging data lineage..."
echo ""
python3 data_lineage.py
echo ""

# ── Summary ──────────────────────────────────────────────────────────────────

echo "============================================================"
echo "  PIPELINE COMPLETE"
echo "============================================================"
echo ""
echo "  Databases:"
echo "    PostgreSQL: 11 data tables + 2 Spark output tables + lineage"
echo "    MongoDB:    2 collections (reviews + Reddit)"
echo ""
echo "  Parquet files:"
echo "    storage/master_dataset.parquet (1,216 games × 59 columns)"
echo "    storage/sentiment_scores.parquet"
echo "    storage/pre_launch_signals.parquet"
echo ""
echo "  Analysis:"
echo "    12 DuckDB queries executed"
echo "    storage/master_dataset.csv exported for notebook"
echo "    22 lineage entries logged"
echo "    storage/data_lineage_diagram.md (Mermaid diagram)"
echo ""
echo "  Next: Open notebooks/ and build visualisations"
echo "============================================================"

# Meeting minutes
display(Image('pipeline_notebook_images/meeting_minutes.png', width=1200))

# Trello board screenshots
# During project
display(Image('pipeline_notebook_images/trello_before.jpeg', width=1200))

# Project finalised screenshot
display(Image('pipeline_notebook_images/trello_after.jpeg', width=1200))

#	Source	Type	Auth	Records	Key Fields
1	Kaggle Steam Dataset	Flat file (JSON)	None	122,611 (filtered to 1,216)	app_id, name, price, peak_ccu, genres, tags
2	Steam Store API	REST API	None	1,006	descriptions, categories, platforms, metacritic
3	Steam Reviews API	REST API	None	57,938 reviews	review text, voted_up, playtime, author stats
4	SteamSpy API	REST API	None	1,216	ownership estimates, playtime, CCU
5	Reddit JSON endpoints	Web scraping	None	52,842 posts	post title, score, comments, subreddit info
6	IGDB via Twitch	REST API	OAuth2	1,150	ratings, platforms, genres, game modes
7	Google Trends	pytrends	None	1,216	weekly search interest 2019–2024
8	Steam Charts	Web scraping (BS4)	None	40,875 data points	monthly player counts

	app_id	name	release_date	price	dlc_count	short_description	windows	mac	linux	...	review_positivity_ratio	price_tier	steam_platform_count	num_supported_languages	playtime_retention_ratio	reddit_engagement	has_franchise	owners_estimate	tier_rank	success_tier
0	4000	Garry's Mod	Nov 29, 2006	4.99	0	Garry's Mod is a physics sandbox. There aren't...	True	True	True	...	0.968	under_5	3	31	0.135	237108.98	False	35000000.0	1	Breakout Hit
1	4920	Natural Selection 2	Oct 30, 2012	0.99	10	A fast paced multiplayer shooter that pits ali...	True	False	False	...	0.853	under_5	1	10	0.189	6617640.46	False	1500000.0	1	Breakout Hit
2	16900	GROUND BRANCH	Aug 14, 2018	25.49	0	From one of the developers behind the original...	True	False	False	...	0.893	15_to_30	1	0	0.415	858368.74	False	1500000.0	1	Breakout Hit
3	40700	Machinarium	Oct 16, 2009	4.99	1	Machinarium is the award-winning independent a...	True	True	False	...	0.950	under_5	2	0	0.761	775480.33	False	1500000.0	1	Breakout Hit
4	40800	Super Meat Boy	Nov 30, 2010	2.99	1	The infamous, tough-as-nails platformer comes ...	True	True	True	...	0.946	under_5	3	0	0.351	389284.71	False	3500000.0	1	Breakout Hit

	price_tier	total_games	breakout_hits	strong_successes	moderate_successes	underperformers	pct_breakout	avg_peak_ccu	avg_owners
0	free	157	63.0	62.0	20.0	12.0	40.1	2353.0	2920478.0
1	15_to_30	181	44.0	74.0	51.0	12.0	24.3	2613.0	1944116.0
2	under_5	352	134.0	126.0	65.0	27.0	38.1	1079.0	1708139.0
3	premium_30+	10	2.0	5.0	2.0	1.0	20.0	2685.0	1214500.0
4	5_to_15	516	125.0	223.0	117.0	51.0	24.2	850.0	1214205.0

	num_platforms	num_games	breakout_hits	strong_successes	moderate_successes	underperformers	avg_critic_score	avg_owners	avg_peak_ccu
0	0	8	3.0	4.0	1.0	0.0	NaN	1141875.0	236.0
1	1	421	79.0	173.0	120.0	49.0	79.5	996188.0	1030.0
2	2	122	28.0	48.0	32.0	14.0	79.1	1143852.0	1801.0
3	3	204	64.0	80.0	42.0	18.0	76.1	1488995.0	1141.0
4	4	91	25.0	44.0	17.0	5.0	74.2	1905769.0	1338.0
5	5	89	40.0	35.0	13.0	1.0	79.8	2632135.0	1192.0
6	6	90	42.0	42.0	5.0	1.0	79.9	2551889.0	1362.0
7	7	48	24.0	17.0	6.0	1.0	79.7	3025938.0	1572.0
8	8	41	24.0	14.0	3.0	0.0	81.0	2137805.0	754.0
9	9	20	8.0	10.0	2.0	0.0	82.2	1890000.0	698.0
10	10	7	4.0	2.0	0.0	1.0	83.4	7001429.0	6425.0
11	11	5	4.0	1.0	0.0	0.0	81.7	11470000.0	10459.0
12	12	2	1.0	0.0	1.0	0.0	82.0	1825000.0	251.0
13	15	1	1.0	0.0	0.0	0.0	81.0	35000000.0	24580.0
14	16	1	1.0	0.0	0.0	0.0	84.3	1500000.0	148.0

	success_tier	tier_rank	num_games	avg_vader_score	avg_pct_positive	avg_playtime_at_review_mins	avg_reviews_per_game	avg_owners
0	Breakout Hit	1	368	0.4290	83.2	7490.0	74.0	4769022.0
1	Strong Success	2	490	0.4147	81.2	5721.0	45.0	547551.0
2	Moderate Success	3	254	0.4525	81.2	4295.0	27.0	101811.0
3	Underperformer	4	100	0.4385	81.4	3950.0	17.0	10000.0

The Indie Game Survival Guide: What Makes an Indie Game Succeed on Steam?¶

1. Introduction to the Project¶

2. Pipeline Architecture¶

Storage justification¶

3. Data Sources¶

4. Schema Design¶

PostgreSQL (13 tables)¶

MongoDB (2 collections)¶

PostgreSQL (13 tables)¶

MongoDB (2 collections)¶

5. Spark Pipeline¶

6. DuckDB Analysis¶

7. Data Lineage¶

8. Automation¶

`run_pipeline.sh` — End-to-End Pipeline Orchestration¶

9. Key Findings¶

10. Conclusion¶

11. Final Deliverable¶

12. Limitations and Future Work¶

12.1 Limitations¶

12.2 Future Work¶

APPENDIX¶

Appendix A: Meeting Minutes¶

Appendix B: Project Management¶

Overview¶

Timeline¶

Individual Contributions¶

Key Challenges and How They Were Resolved¶

Appendix C: AI Usage Acknowledgement¶

Tool Used¶

How AI Was Used¶

What AI Did Not Do¶

Declaration¶

References¶

	success_tier	tier_rank	has_subreddit	num_games	avg_subscribers	avg_posts	avg_engagement	avg_owners
0	Breakout Hit	1	true	280	97599.0	49.0	1727092.9	5350000.0
1	Breakout Hit	1	false	88	NaN	47.0	1117135.4	2920455.0
2	Strong Success	2	true	303	8100.0	48.0	1169413.3	562541.0
3	Strong Success	2	false	187	NaN	43.0	1441523.4	523262.0
4	Moderate Success	3	true	118	2561.0	48.0	1048253.0	104449.0
5	Moderate Success	3	false	137	NaN	39.0	884266.4	99891.0
6	Underperformer	4	true	25	7829.0	45.0	724093.7	10000.0
7	Underperformer	4	false	78	NaN	38.0	1301657.3	10000.0

	success_tier	tier_rank	num_games	avg_pre_launch_interest	avg_post_launch_interest	avg_days_to_peak	rising_trends	falling_trends	avg_owners
0	Breakout Hit	1	185	12.44	28.84	162.0	148	20	4070270.0
1	Strong Success	2	324	8.02	26.67	6.0	244	45	537654.0
2	Moderate Success	3	146	8.00	25.22	-206.0	76	20	112671.0
3	Underperformer	4	49	9.61	27.38	-365.0	21	3	10000.0

	name	success_tier	peak_ccu	owners_estimate	price	review_ratio	vader_score	num_platforms	reddit_subs	trend_peak
0	Rust	Breakout Hit	143870	35000000.0	19.99	0.872	0.190	2	390673	100
1	Wallpaper Engine	Breakout Hit	91184	35000000.0	3.99	0.980	0.566	<NA>	212778	100
2	DELTARUNE	Breakout Hit	83936	1500000.0	19.99	0.989	0.651	6	470882	100
3	Stardew Valley	Breakout Hit	50662	35000000.0	8.99	0.984	0.812	11	3114769	100
4	Crosshair X	Breakout Hit	41428	3500000.0	3.49	0.934	0.471	<NA>	0	100
5	Don't Starve Together	Breakout Hit	30542	15000000.0	5.09	0.953	0.589	4	42503	100
6	Geometry Dash	Breakout Hit	27850	7500000.0	2.49	0.933	0.460	4	646126	100
7	Terraria	Breakout Hit	24580	35000000.0	4.99	0.975	0.504	15	1590045	100
8	Project Zomboid	Breakout Hit	22864	15000000.0	13.39	0.942	0.317	3	556701	100
9	Schedule I	Breakout Hit	22757	15000000.0	13.99	0.984	0.635	1	117	100
10	ARK: Survival Evolved	Breakout Hit	22170	35000000.0	9.89	0.838	0.460	10	27355	100
11	Rocket League®	Breakout Hit	21106	15000000.0	0.00	0.874	0.352	<NA>	1850853	0
12	RimWorld	Breakout Hit	19211	7500000.0	27.99	0.980	0.533	3	633623	100
13	Garry's Mod	Breakout Hit	18400	35000000.0	4.99	0.968	0.593	3	8547	100
14	Palworld	Breakout Hit	18028	75000000.0	22.49	0.941	0.676	5	755450	100
15	Euro Truck Simulator 2	Breakout Hit	17281	15000000.0	4.99	0.974	0.470	1	9	100
16	Balatro	Breakout Hit	17123	3500000.0	12.74	0.980	0.638	10	567722	100
17	7 Days to Die	Breakout Hit	17045	15000000.0	24.74	0.886	0.589	7	219740	100
18	Factorio	Breakout Hit	16426	7500000.0	35.00	0.970	0.539	5	472507	100
19	Slay the Spire	Breakout Hit	15038	7500000.0	2.49	0.978	0.660	3	282473	100

	price_tier	underperformer_games	avg_owners	avg_price	avg_peak_ccu	avg_review_ratio	avg_vader	avg_platforms	avg_reddit_posts	with_subreddit
0	5_to_15	51	10000.0	10.07	146.0	0.867	0.4507	2.1	41.0	11.0
1	under_5	27	10000.0	2.92	144.0	0.824	0.3922	1.7	36.0	4.0
2	free	12	10000.0	0.00	16575.0	0.680	0.4017	1.6	35.0	4.0
3	15_to_30	12	10000.0	17.86	138.0	0.818	0.5432	2.6	47.0	5.0
4	premium_30+	1	10000.0	35.99	67.0	0.685	0.1148	1.0	50.0	1.0

	success_tier	tier_rank	n	avg_owners	avg_price	avg_ccu	review_ratio	vader	platforms	reddit_posts	reddit_subs	trend_peak	steam_platforms	retention	achievements
0	Breakout Hit	1	368	4769022.0	7.25	3335.0	0.899	0.4290	4.1	48.0	97599.0	92.4	1.9	0.381	56.0
1	Strong Success	2	490	547551.0	8.83	383.0	0.871	0.4147	3.1	46.0	8100.0	85.3	1.7	0.496	65.0
2	Moderate Success	3	255	102000.0	9.77	237.0	0.851	0.4525	2.4	43.0	2561.0	62.0	1.5	0.858	49.0
3	Underperformer	4	103	10000.0	8.18	2058.0	0.834	0.4385	2.0	40.0	7829.0	48.5	1.6	1.019	41.0

	success_tier	tier_rank	num_games	avg_playtime_mins	median_playtime_mins	avg_retention_ratio	steamspy_avg_playtime	steamspy_median_playtime	avg_owners
0	Breakout Hit	1	365	2265.0	583.0	0.381	2314.0	590.0	4795890.0
1	Strong Success	2	485	1816.0	614.0	0.497	2045.0	652.0	547113.0
2	Moderate Success	3	245	1752.0	1493.0	0.860	1509.0	1284.0	102224.0
3	Underperformer	4	80	1457.0	1077.0	1.019	1655.0	1334.0	10000.0

	tag_name	num_games	breakout_hits	strong_successes	moderate_successes	underperformers	pct_strong_or_above	avg_peak_ccu
0	Mature	35	18.0	16.0	1.0	0.0	97.1	3269.0
1	4 Player Local	32	21.0	10.0	1.0	0.0	96.9	1500.0
2	Great Soundtrack	120	71.0	45.0	4.0	0.0	96.7	2198.0
3	Competitive	29	18.0	10.0	0.0	1.0	96.6	1197.0
4	Fighting	22	13.0	8.0	0.0	1.0	95.5	1018.0
5	Memes	44	24.0	17.0	2.0	1.0	93.2	2736.0
6	Fast-Paced	29	17.0	10.0	2.0	0.0	93.1	1252.0
7	Local Co-Op	84	42.0	35.0	6.0	1.0	91.7	841.0
8	Blood	21	10.0	9.0	2.0	0.0	90.5	556.0
9	Moddable	82	40.0	34.0	7.0	1.0	90.2	1832.0
10	Science	20	5.0	13.0	2.0	0.0	90.0	667.0
11	Survival Horror	75	42.0	25.0	6.0	2.0	89.3	1595.0
12	Free to Play	132	59.0	58.0	13.0	2.0	88.6	1102.0
13	Gore	87	47.0	30.0	10.0	0.0	88.5	681.0
14	Voxel	26	7.0	16.0	2.0	1.0	88.5	1143.0
15	Nudity	43	17.0	21.0	4.0	1.0	88.4	3910.0
16	Massively Multiplayer	76	35.0	32.0	8.0	1.0	88.2	1621.0
17	Parkour	25	8.0	14.0	2.0	1.0	88.0	295.0
18	War	66	24.0	34.0	8.0	0.0	87.9	1566.0
19	Violent	72	36.0	27.0	9.0	0.0	87.5	643.0

	tag_name	num_games	underperformers	pct_underperformer
0	Collectathon	23	7.0	30.4
1	Creature Collector	28	7.0	25.0
2	Idler	85	14.0	16.5
3	Farming Sim	49	8.0	16.3
4	Cozy	33	5.0	15.2
5	Time Management	55	8.0	14.5
6	Demons	21	3.0	14.3
7	Auto Battler	29	4.0	13.8
8	Score Attack	22	3.0	13.6
9	2D Platformer	45	6.0	13.3

	release_year	num_games	breakout_hits	strong_successes	moderate_successes	underperformers	pct_breakout	pct_underperformer	avg_peak_ccu	avg_owners
0	2010	2	2.0	0.0	0.0	0.0	100.0	0.0	66.0	3500000.0
1	2011	6	6.0	0.0	0.0	0.0	100.0	0.0	4233.0	8416667.0
2	2012	12	11.0	1.0	0.0	0.0	91.7	0.0	1895.0	4562500.0
3	2013	13	10.0	1.0	2.0	0.0	76.9	0.0	2435.0	6346154.0
4	2014	31	18.0	13.0	0.0	0.0	58.1	0.0	1374.0	2282258.0
5	2015	56	28.0	23.0	5.0	0.0	50.0	0.0	963.0	2153929.0
6	2016	53	31.0	17.0	4.0	1.0	58.5	1.9	2044.0	2928962.0
7	2017	56	26.0	21.0	8.0	1.0	46.4	1.8	1283.0	4357232.0
8	2018	87	37.0	43.0	6.0	1.0	42.5	1.1	3434.0	2657701.0
9	2019	68	30.0	31.0	4.0	3.0	44.1	4.4	839.0	1535368.0
10	2020	102	38.0	46.0	12.0	6.0	37.3	5.9	1242.0	1701765.0
11	2021	99	40.0	44.0	11.0	4.0	40.4	4.0	852.0	1439747.0
12	2022	97	25.0	51.0	15.0	6.0	25.8	6.2	631.0	1166804.0
13	2023	129	25.0	65.0	33.0	6.0	19.4	4.7	735.0	1029341.0
14	2024	197	23.0	92.0	64.0	18.0	11.7	9.1	1023.0	1287893.0
15	2025	204	16.0	40.0	91.0	57.0	7.8	27.9	1902.0	370392.0

Step	Action	What it does
0	Prerequisite checks	Validates Docker, Python, Java, `.env`, and data files are present
1	Start containers	`docker compose up -d` (PostgreSQL + MongoDB)
2	Wait for readiness	Polls both databases for up to 30s each
3	Load data	Runs `load_postgres.py` and `load_mongodb.py`
4	Spark pipeline	Joins, VADER sentiment, feature engineering, Parquet export
5	DuckDB queries	12 analytical queries on Parquet files (Lakehouse pattern)
6	Data lineage	Logs all 22 pipeline steps to PostgreSQL

Area	Description
Spark Pipeline	Claude Code assisted in writing and debugging `spark_pipeline.py`, including the join logic across 8 data sources, VADER sentiment integration, and feature engineering (success tier classification, playtime ratios, subreddit metrics).
DuckDB Queries	Claude Code helped write and refine the 12 analytical queries in `duckdb_queries.py`, particularly for multi-source joins, aggregation logic, and formatting results for visualisation.
Data Collection Scripts	Assisted in debugging API integrations (Steam, SteamSpy, IGDB OAuth2, Reddit, Google Trends) and handling edge cases (rate limiting, missing fields, data normalisation).
Notebook Visualisations	Claude Code assisted in generating matplotlib/seaborn charts for Q1–Q12 findings and writing the accompanying insight commentary.
Data Lineage	Helped design and implement the 22-step lineage tracking in `data_lineage.py`.
Documentation	Assisted in drafting sections of this notebook including architecture descriptions, schema explanations, and the key findings summary.

The Indie Game Survival Guide: What Makes an Indie Game Succeed on Steam?¶

1. Introduction to the Project¶

2. Pipeline Architecture¶

Storage justification¶

3. Data Sources¶

4. Schema Design¶

PostgreSQL (13 tables)¶

MongoDB (2 collections)¶

PostgreSQL (13 tables)¶

MongoDB (2 collections)¶

5. Spark Pipeline¶

6. DuckDB Analysis¶

7. Data Lineage¶

8. Automation¶

run_pipeline.sh — End-to-End Pipeline Orchestration¶

9. Key Findings¶

10. Conclusion¶

11. Final Deliverable¶

12. Limitations and Future Work¶

12.1 Limitations¶

12.2 Future Work¶

APPENDIX¶

Appendix A: Meeting Minutes¶

Appendix B: Project Management¶

Overview¶

Timeline¶

Individual Contributions¶

Key Challenges and How They Were Resolved¶

Appendix C: AI Usage Acknowledgement¶

Tool Used¶

How AI Was Used¶

What AI Did Not Do¶

Declaration¶

References¶

`run_pipeline.sh` — End-to-End Pipeline Orchestration¶