# Salary distribution histograms: one figure per (currency, salary type) pair.
#
# Rows missing the salary, currency or salary type are dropped, and the
# "unknown", "annually" and "daily" salary types are left out of the charts.
excluded_salary_types = ["unknown", "annually", "daily"]
df_valid_salary = df.dropna(subset=["average_salary", "currency", "salary_type"])
df_valid_salary = df_valid_salary[
    ~df_valid_salary["salary_type"].isin(excluded_salary_types)
].copy()

currencies = df_valid_salary["currency"].unique()
salary_types = df_valid_salary["salary_type"].unique()

for curr in currencies:
    for sal_type in salary_types:
        subset = df_valid_salary[
            (df_valid_salary["currency"] == curr)
            & (df_valid_salary["salary_type"] == sal_type)
        ]

        # Not every currency / salary-type combination occurs in the data.
        if subset.empty:
            continue

        # Rice rule (2 * n^(1/3)) sized from the rows actually plotted in this
        # figure. Sizing it from len(df) gave every chart the same bin count,
        # far too many for the small subsets.
        n_bins = max(1, int(2 * len(subset) ** (1 / 3)))

        # USD charts are blue; every other currency is orange.
        color = "#636EFA" if curr == "USD" else "#FFA15A"

        fig = px.histogram(
            subset,
            x="average_salary",
            title=f"Salary Distribution: {curr} - {sal_type.capitalize()}",
            color_discrete_sequence=[color],
            nbins=n_bins,
            height=500,
        )

        fig.update_layout(
            xaxis_title=f"Average Salary ({curr})",
            yaxis_title="Number of Listings",
            template="plotly_white",
        )
        fig.show()

# Box plots of average salary split by work type, drawn once for every
# currency / salary-type combination that has at least one listing.
for curr in currencies:
    currency_mask = df_valid_salary["currency"] == curr
    for sal_type in salary_types:
        type_mask = df_valid_salary["salary_type"] == sal_type
        subset = df_valid_salary[currency_mask & type_mask]

        # Skip combinations that never occur together in the data.
        if subset.empty:
            continue

        box_title = f"Pay Range by Work Type: {curr} - {sal_type.capitalize()}"
        fig = px.box(
            subset,
            x="work_type",
            y="average_salary",
            color="work_type",
            points="outliers",
            height=600,
            title=box_title,
        )
        fig.update_layout(
            template="plotly_white",
            legend_title="Work Type",
            xaxis_title="Work Type",
            yaxis_title=f"Average Salary ({curr})",
        )
        fig.show()

# Top-10 categories by mean USD salary, restricted to monthly-paid listings
# that have a category.
cat_df = df[df["category"].notna()]
cat_df = cat_df.loc[cat_df["salary_type"].eq("monthly")]

# One row per category with the mean of its usd_salary values.
cat_group = (
    cat_df.groupby("category")["usd_salary"].mean().rename("avg_usd").reset_index()
)

# Take the ten highest means, then order ascending so the largest bar is
# drawn at the top of the horizontal chart.
top10_cat = cat_group.nlargest(10, "avg_usd")
top10_cat = top10_cat.sort_values("avg_usd")

fig_usd = go.Figure()
fig_usd.add_trace(
    go.Bar(
        x=top10_cat["avg_usd"],
        y=top10_cat["category"],
        orientation="h",
        name="Avg USD Salary",
        marker_color="steelblue",
        text=top10_cat["avg_usd"].map("${:,.0f}".format),
        textposition="outside",
    )
)
fig_usd.update_layout(
    title="Average Monthly Pay by Category (Top 10) — USD",
    xaxis_title="Average Monthly Salary (USD)",
    yaxis_title="Category",
    height=500,
    # Wide side margins leave room for category names and outside labels.
    margin={"l": 200, "r": 150},
)
fig_usd.show()

# Top-10 subcategories by mean USD salary among monthly-paid, full-time
# listings that have a subcategory.
sub_df = df[df["subcategory"].notna()]
monthly_full_time = sub_df["salary_type"].eq("monthly") & sub_df["work_type"].eq(
    "Full Time"
)
sub_df = sub_df.loc[monthly_full_time]

# One row per subcategory with the mean of its usd_salary values.
sub_group = (
    sub_df.groupby("subcategory")["usd_salary"].mean().rename("avg_usd").reset_index()
)

# Ten highest means, ascending so the largest bar ends up on top.
top10_sub = sub_group.nlargest(10, "avg_usd")
top10_sub = top10_sub.sort_values("avg_usd")

fig_usd = go.Figure()
fig_usd.add_trace(
    go.Bar(
        x=top10_sub["avg_usd"],
        y=top10_sub["subcategory"],
        orientation="h",
        name="Avg USD Salary",
        marker_color="steelblue",
        text=top10_sub["avg_usd"].map("${:,.0f}".format),
        textposition="outside",
    )
)
fig_usd.update_layout(
    title="Average Monthly Pay by Subcategory (Top 10) — USD",
    xaxis_title="Average Monthly Salary (USD)",
    yaxis_title="Subcategory",
    height=550,
    # Wide side margins leave room for subcategory names and outside labels.
    margin={"l": 250, "r": 150},
)
fig_usd.show()

def parse_skills(series):
    """Flatten a semicolon-delimited skills column into one skill per row.

    Missing entries are dropped, each remaining string is split on ";", and
    every token is stripped of surrounding whitespace and lower-cased. Tokens
    that are empty after stripping (from a trailing ";", a doubled ";;" or a
    blank cell) are discarded so they are never counted as a skill.

    Args:
        series: pandas Series of semicolon-separated skill strings.

    Returns:
        A pandas Series of normalised skill names. The index repeats the
        source row label once for every skill taken from that row.
    """
    tokens = series.dropna().str.split(";").explode().str.strip().str.lower()
    # An empty token is an artefact of the delimiter, not a real skill.
    return tokens[tokens != ""]

# Horizontal bar chart of the ten most frequently listed skills.
top_skill_freq = parse_skills(df["skills"]).value_counts().head(10)
skill_counts = top_skill_freq.reset_index()
skill_counts.columns = ["Skill", "Count"]

fig_top_skills = px.bar(
    skill_counts,
    y="Skill",
    x="Count",
    text="Count",
    color="Count",
    color_continuous_scale="Blues",
    orientation="h",
    title="Top 10 Most Required Skills",
)

# Count labels sit just past the end of each bar.
fig_top_skills.update_traces(textposition="outside")
fig_top_skills.update_layout(
    template="plotly_white",
    coloraxis_showscale=False,
    xaxis_title="Number of Job Listings",
    yaxis_title="Skill",
    # Reversed axis puts the most frequent skill at the top.
    yaxis={"autorange": "reversed"},
)
fig_top_skills.show()

# One "top 10 skills" bar chart per work type found in the data.
work_types = df["work_type"].dropna().unique()

for wt in work_types:
    subset = df[df["work_type"] == wt]

    wt_skill_freq = parse_skills(subset["skills"]).value_counts().head(10)
    skill_wt_counts = wt_skill_freq.reset_index()
    skill_wt_counts.columns = ["Skill", "Count"]

    # Only draw a chart when this work type lists at least one skill.
    if not skill_wt_counts.empty:
        fig_wt = px.bar(
            skill_wt_counts,
            y="Skill",
            x="Count",
            text="Count",
            color="Count",
            color_continuous_scale="Oranges",
            orientation="h",
            title=f"Top 10 Skills — {wt}",
        )

        fig_wt.update_traces(textposition="outside")
        fig_wt.update_layout(
            template="plotly_white",
            coloraxis_showscale=False,
            xaxis_title="Number of Job Listings",
            yaxis_title="Skill",
            # Reversed axis puts the most frequent skill at the top.
            yaxis={"autorange": "reversed"},
        )
        fig_wt.show()

# Co-occurrence heatmap for the TOP_N most frequently listed skills.
TOP_N = 15

skill_frequency = parse_skills(df["skills"]).value_counts()
top_skills = skill_frequency.head(TOP_N).index.tolist()

# Count, per listing, every unordered pair of distinct top skills it mentions.
pair_counts = Counter()
for raw_skills in df["skills"].dropna():
    normalised = {token.strip().lower() for token in raw_skills.split(";")}
    shared = sorted(name for name in normalised if name in top_skills)
    # Sorting first gives each pair a single canonical (a, b) key.
    pair_counts.update(combinations(shared, 2))

# Symmetric skill x skill table; the diagonal is left at zero.
matrix = pd.DataFrame(0, index=top_skills, columns=top_skills)
for (first, second), n_listings in pair_counts.items():
    matrix.loc[first, second] = n_listings
    matrix.loc[second, first] = n_listings

cooccurrence = matrix.values
heat_trace = go.Heatmap(
    z=cooccurrence,
    x=list(matrix.columns),
    y=list(matrix.index),
    text=cooccurrence,
    texttemplate="%{text}",
    colorscale="Purples",
    hoverongaps=False,
    hovertemplate="<b>%{y}</b> + <b>%{x}</b><br>Co-occurrences: %{z}<extra></extra>",
)
fig_heatmap = go.Figure(data=heat_trace)

fig_heatmap.update_layout(
    title=f"Skill Pairing Heatmap (Top {TOP_N} Skills)",
    xaxis_title="Skill",
    yaxis_title="Skill",
    template="plotly_white",
    height=650,
    xaxis={"tickangle": -35},
)

fig_heatmap.show()

# Horizontal bar chart of the fifty most frequently listed skills.
top50_freq = parse_skills(df["skills"]).value_counts().head(50)
skill_counts_50 = top50_freq.reset_index()
skill_counts_50.columns = ["Skill", "Count"]

fig_top50 = px.bar(
    skill_counts_50,
    y="Skill",
    x="Count",
    text="Count",
    color="Count",
    color_continuous_scale="Blues",
    orientation="h",
    # Tall figure so that fifty bars stay readable.
    height=1400,
    title="Top 50 Most Required Skills",
)

fig_top50.update_traces(textposition="outside")
fig_top50.update_layout(
    template="plotly_white",
    coloraxis_showscale=False,
    xaxis_title="Number of Job Listings",
    yaxis_title="Skill",
    # Reversed axis puts the most frequent skill at the top.
    yaxis={"autorange": "reversed"},
)
fig_top50.show()

# Co-occurrence heatmap for the 25 most frequently listed skills.
TOP_N_25 = 25

skill_frequency_25 = parse_skills(df["skills"]).value_counts()
top_skills_25 = skill_frequency_25.head(TOP_N_25).index.tolist()

# Count, per listing, every unordered pair of distinct top skills it mentions.
pair_counts_25 = Counter()
for raw_skills in df["skills"].dropna():
    normalised = {token.strip().lower() for token in raw_skills.split(";")}
    shared = sorted(name for name in normalised if name in top_skills_25)
    # Sorting first gives each pair a single canonical (a, b) key.
    pair_counts_25.update(combinations(shared, 2))

# Symmetric skill x skill table; the diagonal is left at zero.
matrix_25 = pd.DataFrame(0, index=top_skills_25, columns=top_skills_25)
for (first, second), n_listings in pair_counts_25.items():
    matrix_25.loc[first, second] = n_listings
    matrix_25.loc[second, first] = n_listings

cooccurrence_25 = matrix_25.values
heat_trace_25 = go.Heatmap(
    z=cooccurrence_25,
    x=list(matrix_25.columns),
    y=list(matrix_25.index),
    text=cooccurrence_25,
    texttemplate="%{text}",
    colorscale="Purples",
    hoverongaps=False,
    hovertemplate="<b>%{y}</b> + <b>%{x}</b><br>Co-occurrences: %{z}<extra></extra>",
)
fig_heatmap_25 = go.Figure(data=heat_trace_25)

fig_heatmap_25.update_layout(
    title="Skill Pairing Heatmap (Top 25 Skills)",
    xaxis_title="Skill",
    yaxis_title="Skill",
    template="plotly_white",
    height=850,
    width=950,
    xaxis={"tickangle": -35},
)

fig_heatmap_25.show()

# Pie chart of how many listings fall under each work type.
work_type_counts = (
    df["work_type"]
    .value_counts()
    .rename_axis("Work Type")
    .reset_index(name="Job Count")
)

fig1 = px.pie(
    work_type_counts,
    values="Job Count",
    names="Work Type",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="Number of Jobs per Work Type",
)

fig1.update_layout(template="plotly_white")
# Show the share and the work-type name inside each slice.
fig1.update_traces(textinfo="percent+label", textposition="inside")
fig1.show()

# Monthly posting volume: parse the update date, bucket by calendar month and
# plot the number of listings per month as a filled line chart.
# Note that this adds two helper columns to df itself.
df["date_updated_parsed"] = pd.to_datetime(df["date_updated"], errors="coerce")
df["year_month"] = df["date_updated_parsed"].dt.to_period("M")

# Rows whose date could not be parsed were coerced to NaT and are left out.
dated_rows = df.dropna(subset=["date_updated_parsed"])
postings_per_month = dated_rows.groupby("year_month").size()
monthly_volume = postings_per_month.reset_index(name="job_count").sort_values(
    "year_month"
)
# Convert the periods to timestamps for the x axis.
monthly_volume["year_month_dt"] = monthly_volume["year_month"].dt.to_timestamp()

volume_trace = go.Scatter(
    x=monthly_volume["year_month_dt"],
    y=monthly_volume["job_count"],
    mode="lines+markers+text",
    text=monthly_volume["job_count"],
    textposition="top center",
    textfont={"size": 11},
    line={"color": "#4C72B0", "width": 2.5},
    marker={"size": 8, "color": "white", "line": {"color": "#4C72B0", "width": 2.5}},
    fill="tozeroy",
    fillcolor="rgba(76, 114, 176, 0.12)",
    hovertemplate="<b>%{x|%B %Y}</b><br>Job Postings: %{y}<extra></extra>",
)

fig = go.Figure()
fig.add_trace(volume_trace)

fig.update_layout(
    title={
        "text": "Job Posting Volume Over Time (Monthly)",
        "font": {"size": 16},
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis={
        "title": "Month",
        "tickformat": "%b %Y",
        "tickangle": -45,
        "showgrid": False,
        # One tick per month.
        "dtick": "M1",
    },
    yaxis={
        "title": "Number of Job Postings",
        "showgrid": True,
        "gridcolor": "#eeeeee",
    },
    plot_bgcolor="white",
    hovermode="x unified",
    margin={"t": 60, "b": 80, "l": 60, "r": 40},
)

fig.show()

# Weekly-hours distribution: bucket the positive, numeric hours_per_week
# values into fixed ranges and chart the number of listings in each range.
hours_numeric = pd.to_numeric(df["hours_per_week"], errors="coerce")
# Non-numeric entries became NaN above; NaN fails the > 0 test as well.
hours_clean = hours_numeric[hours_numeric > 0]

# pd.cut intervals are closed on the right: (0, 10], (10, 20], ..., (80, inf].
bins = [0, 10, 20, 30, 40, 50, 60, 80, float("inf")]
labels = ["1-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-80", "80+"]
hours_binned = pd.cut(hours_clean, bins=bins, labels=labels)

# Counts in label order, with zero for any range that has no listings.
range_counts = hours_binned.value_counts().reindex(labels)
hours_dist = range_counts.fillna(0).astype(int).reset_index()
hours_dist.columns = ["range", "count"]

# The 31-40 range is highlighted; every other bar uses the base colour.
colors = [
    "#E05A2B" if bucket == "31-40" else "#4C72B0" for bucket in hours_dist["range"]
]

fig2 = go.Figure(
    go.Bar(
        x=hours_dist["range"],
        y=hours_dist["count"],
        text=hours_dist["count"],
        textposition="outside",
        marker_color=colors,
        marker_line_color="white",
        marker_line_width=1.5,
        hovertemplate="<b>%{x} hrs/week</b><br>Job Postings: %{y}<extra></extra>",
    )
)

# Both legend-style notes are pinned to the top-right corner of the plot area.
corner_note = {
    "x": 0.99,
    "xref": "paper",
    "yref": "paper",
    "showarrow": False,
    "align": "right",
}

fig2.update_layout(
    title={
        "text": "Weekly Hours Distribution<br><sup>TBD & missing values excluded</sup>",
        "font": {"size": 16},
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis={"title": "Hours per Week", "showgrid": False},
    yaxis={
        "title": "Number of Job Postings",
        "showgrid": True,
        "gridcolor": "#eeeeee",
    },
    plot_bgcolor="white",
    margin={"t": 80, "b": 60, "l": 60, "r": 40},
    annotations=[
        {
            **corner_note,
            "y": 0.97,
            "text": "31-40 hrs = standard full-time",
            "font": {"size": 11, "color": "#E05A2B"},
        },
        {
            **corner_note,
            "y": 0.90,
            "text": "🔵 Other ranges",
            "font": {"size": 11, "color": "#4C72B0"},
        },
    ],
)

fig2.show()

print(f"Records with valid hours: {len(hours_clean):,} / {len(df):,} total")

# Output: Records with valid hours: 20,147 / 23,908 total

# Pay vs. hours: one scatter plot per currency (USD, PHP) with a trace for each
# work type, using monthly-paid listings that state whole-number weekly hours.
#
# NOTE: this cell rebinds `df` to the filtered frame, so anything run after it
# sees only these rows.
df["hours_per_week"] = pd.to_numeric(df["hours_per_week"], errors="coerce")
df = df.dropna(subset=["hours_per_week", "average_salary"])
# Keep whole-number hours only. The modulo test is vectorised and treats an
# infinite value as "not whole" instead of raising the way int(inf) does.
df = df[df["hours_per_week"] % 1 == 0]
df = df[df["salary_type"] == "monthly"]

# Drop missing work types so NaN neither takes a colour slot nor produces an
# empty legend entry (the same handling the skills-by-work-type charts use).
work_types = df["work_type"].dropna().unique()

# Cycle through the palette so every work type gets an explicit colour even
# when there are more work types than palette entries.
palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
colors = {wt: palette[i % len(palette)] for i, wt in enumerate(work_types)}

for currency in ["USD", "PHP"]:
    subset = df[df["currency"] == currency]

    fig = go.Figure()

    for wt in work_types:
        wt_data = subset[subset["work_type"] == wt]
        fig.add_trace(
            go.Scatter(
                x=wt_data["hours_per_week"],
                y=wt_data["average_salary"],
                mode="markers",
                name=wt,
                marker=dict(size=9, color=colors.get(wt), opacity=0.75),
                # The doubled braces keep %{y:,.2f} literal inside the f-string
                # so plotly, not Python, formats the salary value.
                hovertemplate=(
                    f"<b>{wt}</b><br>"
                    "Hours/Week: %{x}<br>"
                    f"Avg Salary ({currency}): %{{y:,.2f}}<br>"
                    "%{text}<extra></extra>"
                ),
                # Job title, truncated to 60 characters, shown in the hover box.
                text=wt_data["title"].str.slice(0, 60),
            )
        )

    fig.update_layout(
        title=f"Pay vs Hours — {currency}",
        xaxis_title="Hours per Week",
        yaxis_title=f"Average Salary ({currency})",
        legend_title="Work Type",
        template="plotly_white",
        hovermode="closest",
        # Single click toggles one work type; double click isolates it.
        legend=dict(itemclick="toggle", itemdoubleclick="toggleothers"),
    )

    fig.show()

# Total Jobs Scraped

# Salary Landscape

# Salary Distribution (USD and PHP across different salary types)

# Analysis

# Pay Range by Work Type (USD and PHP across different salary types)

# Analysis

# Average Pay by Category (Top 10)

# Average Pay by Subcategory (Top 10) (both USD and PHP)

# Market Demands

# Top 10 Required Skills

# Skills by Work Type

# Skill Pairing Heatmap

# Top 50 Required Skills

# Skill Pairing Heatmap (Top 25 Skills)

# Work Types & Trends

# Breakdown of the Number of Jobs per Work Type

# Posting Volume Over Time (by Month)

# Weekly Hours Distribution (hours pattern)

# Pay vs. Hours

# Analysis

# END