Total Jobs Scraped¶
Salary Landscape¶
Salary Distribution (USD and PHP across different salary types)¶
# Salary histograms: one figure per (currency, salary type) combination.
# Rows missing salary, currency or salary type are dropped, and the
# "unknown", "annually" and "daily" salary types are excluded.
df_valid_salary = df.dropna(subset=["average_salary", "currency", "salary_type"]).copy()
df_valid_salary = df_valid_salary[
    ~df_valid_salary["salary_type"].isin(["unknown", "annually", "daily"])
]
currencies = df_valid_salary["currency"].unique()
salary_types = df_valid_salary["salary_type"].unique()

# Bin count is 2 * n^(1/3) with n = len(df): the row count of the full
# frame, not of the individual subset being plotted.
n_bins = int(2 * len(df) ** (1 / 3))

for curr in currencies:
    in_currency = df_valid_salary["currency"] == curr
    for sal_type in salary_types:
        subset = df_valid_salary[
            in_currency & (df_valid_salary["salary_type"] == sal_type)
        ]
        if subset.empty:
            # Nothing to draw for this currency / salary type pair.
            continue

        # USD gets blue; every other currency gets orange.
        color = "#636EFA" if curr == "USD" else "#FFA15A"
        fig = px.histogram(
            subset,
            x="average_salary",
            nbins=n_bins,
            height=500,
            color_discrete_sequence=[color],
            title=f"Salary Distribution: {curr} - {sal_type.capitalize()}",
        ).update_layout(
            template="plotly_white",
            xaxis_title=f"Average Salary ({curr})",
            yaxis_title="Number of Listings",
        )
        fig.show()
Analysis¶
Disclaimer: All of the analysis in this notebook was generated by Gemini 3.1 Pro, which was given the visualizations as input.
PHP Monthly: Heavily right-skewed with spikes at clean numbers — 20k, 30k, 40k, 50k. Most listings fall between 20k–60k PHP, with a long tail above 100k.
USD Monthly: Strong clustering at $500 and $1,000, with other
spikes at $400, $800, and $1,500.
Pay Range by Work Type (USD and PHP across different salary types)¶
IQR box plots showing median, Q1–Q3, and outlier range
# Box plots of average salary per work type, one figure for each
# (currency, salary type) pair that has data.
for curr in currencies:
    in_currency = df_valid_salary["currency"] == curr
    for sal_type in salary_types:
        subset = df_valid_salary[
            in_currency & (df_valid_salary["salary_type"] == sal_type)
        ]
        if subset.empty:
            continue

        fig = px.box(
            subset,
            x="work_type",
            y="average_salary",
            color="work_type",
            points="outliers",  # draw only the points flagged as outliers
            height=600,
            title=f"Pay Range by Work Type: {curr} - {sal_type.capitalize()}",
        ).update_layout(
            template="plotly_white",
            legend_title="Work Type",
            xaxis_title="Work Type",
            yaxis_title=f"Average Salary ({curr})",
        )
        fig.show()
Analysis¶
PHP Monthly: Full-time leads with the highest median (~45k PHP) and widest range (30k–60k). Part-time and Any have lower medians (~35k and ~25k) but both show high-earning outliers up to 100k. Gig work sits around 20k; Freelance listings are sparse, clustering near 15k.
USD Monthly: Full-time leads with a median of ~$800 and a range of
$600–$1,200. Other categories (Part-time, Any, Gig) have lower medians
($450–$700), but all feature dense clusters of outliers reaching
$2,000 — matching the absolute peak of full-time earners.
Average Pay By Category (Top 10)¶
# Mean monthly USD salary per category; chart the ten highest averages.
cat_df = df.dropna(subset=["category"])
cat_df = cat_df[cat_df["salary_type"] == "monthly"]

cat_group = (
    cat_df.groupby("category")["usd_salary"].mean().rename("avg_usd").reset_index()
)

# Ascending order so the largest bar ends up at the top of the
# horizontal bar chart.
top10_cat = cat_group.nlargest(10, "avg_usd").sort_values("avg_usd")

category_bar = go.Bar(
    x=top10_cat["avg_usd"],
    y=top10_cat["category"],
    orientation="h",
    name="Avg USD Salary",
    marker_color="steelblue",
    text=top10_cat["avg_usd"].map("${:,.0f}".format),
    textposition="outside",
)
fig_usd = go.Figure(category_bar)
fig_usd.update_layout(
    title="Average Monthly Pay by Category (Top 10) — USD",
    xaxis_title="Average Monthly Salary (USD)",
    yaxis_title="Category",
    height=500,
    # Wide margins leave room for category names and outside value labels.
    margin={"l": 200, "r": 150},
)
fig_usd.show()
Average Pay by Subcategory (Top 10, Full-Time, USD)¶
# Mean monthly USD salary per subcategory; chart the ten highest averages.
# Only "Full Time" monthly listings are included (the chart title does not
# mention the Full Time restriction).
sub_df = df.dropna(subset=["subcategory"])
is_monthly = sub_df["salary_type"] == "monthly"
is_full_time = sub_df["work_type"] == "Full Time"
sub_df = sub_df[is_monthly & is_full_time]

sub_group = (
    sub_df.groupby("subcategory")["usd_salary"].mean().rename("avg_usd").reset_index()
)

# Ascending order so the largest bar ends up at the top of the
# horizontal bar chart.
top10_sub = sub_group.nlargest(10, "avg_usd").sort_values("avg_usd")

subcategory_bar = go.Bar(
    x=top10_sub["avg_usd"],
    y=top10_sub["subcategory"],
    orientation="h",
    name="Avg USD Salary",
    marker_color="steelblue",
    text=top10_sub["avg_usd"].map("${:,.0f}".format),
    textposition="outside",
)
fig_usd = go.Figure(subcategory_bar)
fig_usd.update_layout(
    title="Average Monthly Pay by Subcategory (Top 10) — USD",
    xaxis_title="Average Monthly Salary (USD)",
    yaxis_title="Subcategory",
    height=550,
    # Wide margins leave room for subcategory names and outside value labels.
    margin={"l": 250, "r": 150},
)
fig_usd.show()
Market Demands¶
def parse_skills(series):
    """Flatten a column of ';'-separated skill strings into one skill per row.

    Missing entries are dropped, each remaining string is split on ';',
    and every token is stripped of surrounding whitespace and lower-cased.
    Empty tokens (from a trailing or doubled separator such as "a;" or
    "a;;b") and NaN tokens (from non-string cells) are discarded so they
    are never counted as a skill.

    Args:
        series: pandas Series of ';'-separated skill strings.

    Returns:
        A pandas Series with one normalised skill per element. The index
        repeats the originating row's label for each of its skills.
    """
    skills = series.dropna().str.split(";").explode().str.strip().str.lower()
    # Without this filter "" would show up in value_counts() as a "skill".
    return skills[skills.notna() & (skills != "")]
Top 10 Required Skills¶
Skills listed most frequently across all job postings
# Ten most frequent skills across every listing, as a horizontal bar chart.
skill_counts = (
    parse_skills(df["skills"])
    .value_counts()
    .head(10)
    .rename_axis("Skill")
    .reset_index(name="Count")
)

fig_top_skills = px.bar(
    skill_counts,
    x="Count",
    y="Skill",
    text="Count",
    color="Count",
    orientation="h",
    color_continuous_scale="Blues",
    title="Top 10 Most Required Skills",
)
fig_top_skills.update_traces(textposition="outside")
fig_top_skills.update_layout(
    template="plotly_white",
    coloraxis_showscale=False,
    xaxis_title="Number of Job Listings",
    yaxis_title="Skill",
    # Reversed axis puts the most frequent skill at the top.
    yaxis={"autorange": "reversed"},
)
fig_top_skills.show()
Skills by Work Type¶
Which skills dominate each work arrangement
# One "top 10 skills" bar chart per work type found in the data.
work_types = df["work_type"].dropna().unique()

for wt in work_types:
    subset = df[df["work_type"] == wt]
    skill_wt_counts = (
        parse_skills(subset["skills"])
        .value_counts()
        .head(10)
        .rename_axis("Skill")
        .reset_index(name="Count")
    )
    if skill_wt_counts.empty:
        # No skills listed for this work type; skip the chart.
        continue

    fig_wt = px.bar(
        skill_wt_counts,
        x="Count",
        y="Skill",
        text="Count",
        color="Count",
        orientation="h",
        color_continuous_scale="Oranges",
        title=f"Top 10 Skills — {wt}",
    )
    fig_wt.update_traces(textposition="outside")
    fig_wt.update_layout(
        template="plotly_white",
        coloraxis_showscale=False,
        xaxis_title="Number of Job Listings",
        yaxis_title="Skill",
        # Reversed axis puts the most frequent skill at the top.
        yaxis={"autorange": "reversed"},
    )
    fig_wt.show()
Skill Pairing Heatmap¶
How often top skills appear together in the same listing
# Co-occurrence heatmap: how often two of the TOP_N most frequent skills
# appear together in the same listing.
TOP_N = 15
top_skills = parse_skills(df["skills"]).value_counts().head(TOP_N).index.tolist()

pair_counts = Counter()
for skill_list in df["skills"].dropna():
    normalised = (token.strip().lower() for token in skill_list.split(";"))
    # A set de-duplicates repeated skills within one listing; sorting gives
    # every pair a canonical (alphabetical) key.
    skills_in_row = sorted({name for name in normalised if name in top_skills})
    pair_counts.update(combinations(skills_in_row, 2))

# Symmetric matrix; the diagonal stays 0 because a skill is never paired
# with itself.
matrix = pd.DataFrame(0, index=top_skills, columns=top_skills)
for (s1, s2), count in pair_counts.items():
    matrix.loc[s1, s2] = matrix.loc[s2, s1] = count

heatmap_trace = go.Heatmap(
    x=matrix.columns.tolist(),
    y=matrix.index.tolist(),
    z=matrix.values,
    text=matrix.values,
    texttemplate="%{text}",
    colorscale="Purples",
    hoverongaps=False,
    hovertemplate="<b>%{y}</b> + <b>%{x}</b><br>Co-occurrences: %{z}<extra></extra>",
)
fig_heatmap = go.Figure(data=heatmap_trace)
fig_heatmap.update_layout(
    title=f"Skill Pairing Heatmap (Top {TOP_N} Skills)",
    template="plotly_white",
    height=650,
    xaxis_title="Skill",
    yaxis_title="Skill",
    xaxis={"tickangle": -35},
)
fig_heatmap.show()
Top 50 Required Skills¶
Skills listed most frequently across all job postings (regardless of work type)
# Fifty most frequent skills across every listing, as a tall horizontal
# bar chart.
skill_counts_50 = (
    parse_skills(df["skills"])
    .value_counts()
    .head(50)
    .rename_axis("Skill")
    .reset_index(name="Count")
)

fig_top50 = px.bar(
    skill_counts_50,
    x="Count",
    y="Skill",
    text="Count",
    color="Count",
    orientation="h",
    height=1400,
    color_continuous_scale="Blues",
    title="Top 50 Most Required Skills",
)
fig_top50.update_traces(textposition="outside")
fig_top50.update_layout(
    template="plotly_white",
    coloraxis_showscale=False,
    xaxis_title="Number of Job Listings",
    yaxis_title="Skill",
    # Reversed axis puts the most frequent skill at the top.
    yaxis={"autorange": "reversed"},
)
fig_top50.show()
Skill Pairing Heatmap (Top 25 Skills)¶
How often top 25 skills appear together in the same listing
# Co-occurrence heatmap for the 25 most frequent skills: how often two of
# them appear together in the same listing.
TOP_N_25 = 25
top_skills_25 = parse_skills(df["skills"]).value_counts().head(TOP_N_25).index.tolist()

pair_counts_25 = Counter()
for skill_list in df["skills"].dropna():
    normalised = (token.strip().lower() for token in skill_list.split(";"))
    # A set de-duplicates repeated skills within one listing; sorting gives
    # every pair a canonical (alphabetical) key.
    skills_in_row = sorted({name for name in normalised if name in top_skills_25})
    pair_counts_25.update(combinations(skills_in_row, 2))

# Symmetric matrix; the diagonal stays 0 because a skill is never paired
# with itself.
matrix_25 = pd.DataFrame(0, index=top_skills_25, columns=top_skills_25)
for (s1, s2), count in pair_counts_25.items():
    matrix_25.loc[s1, s2] = matrix_25.loc[s2, s1] = count

heatmap_trace_25 = go.Heatmap(
    x=matrix_25.columns.tolist(),
    y=matrix_25.index.tolist(),
    z=matrix_25.values,
    text=matrix_25.values,
    texttemplate="%{text}",
    colorscale="Purples",
    hoverongaps=False,
    hovertemplate="<b>%{y}</b> + <b>%{x}</b><br>Co-occurrences: %{z}<extra></extra>",
)
fig_heatmap_25 = go.Figure(data=heatmap_trace_25)
fig_heatmap_25.update_layout(
    title="Skill Pairing Heatmap (Top 25 Skills)",
    template="plotly_white",
    height=850,
    width=950,
    xaxis_title="Skill",
    yaxis_title="Skill",
    xaxis={"tickangle": -35},
)
fig_heatmap_25.show()
Work Types & Trends¶
Breakdown of Jobs per Work Type¶
# Share of listings per work type, drawn as a pie chart.
work_type_counts = (
    df["work_type"]
    .value_counts()
    .rename_axis("Work Type")
    .reset_index(name="Job Count")
)

fig1 = px.pie(
    work_type_counts,
    names="Work Type",
    values="Job Count",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="Number of Jobs per Work Type",
)
fig1.update_layout(template="plotly_white")
# Print the percentage and the label inside each slice.
fig1.update_traces(textinfo="percent+label", textposition="inside")
fig1.show()
Posting Volume Over Time (Monthly)¶
# Monthly posting volume, bucketed by the month of each listing's
# "date_updated". Two helper columns are added to df along the way.
# Unparseable dates become NaT (errors="coerce") and are left out of the
# counts.
df["date_updated_parsed"] = pd.to_datetime(df["date_updated"], errors="coerce")
df["year_month"] = df["date_updated_parsed"].dt.to_period("M")

dated_rows = df[df["date_updated_parsed"].notna()]
monthly_volume = (
    dated_rows.groupby("year_month").size().reset_index(name="job_count")
).sort_values("year_month")
# The x axis is fed timestamps (start of each period), not Period values.
monthly_volume["year_month_dt"] = monthly_volume["year_month"].dt.to_timestamp()

volume_trace = go.Scatter(
    x=monthly_volume["year_month_dt"],
    y=monthly_volume["job_count"],
    mode="lines+markers+text",
    text=monthly_volume["job_count"],
    textposition="top center",
    textfont={"size": 11},
    line={"color": "#4C72B0", "width": 2.5},
    marker={
        "size": 8,
        "color": "white",
        "line": {"color": "#4C72B0", "width": 2.5},
    },
    # Lightly shaded area between the line and the x axis.
    fill="tozeroy",
    fillcolor="rgba(76, 114, 176, 0.12)",
    hovertemplate="<b>%{x|%B %Y}</b><br>Job Postings: %{y}<extra></extra>",
)

fig = go.Figure(data=[volume_trace])
fig.update_layout(
    title={
        "text": "Job Posting Volume Over Time (Monthly)",
        "font": {"size": 16},
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis={
        "title": "Month",
        "tickformat": "%b %Y",
        "tickangle": -45,
        "showgrid": False,
        "dtick": "M1",  # one tick per month
    },
    yaxis={
        "title": "Number of Job Postings",
        "showgrid": True,
        "gridcolor": "#eeeeee",
    },
    plot_bgcolor="white",
    hovermode="x unified",
    margin={"t": 60, "b": 80, "l": 60, "r": 40},
)
fig.show()
Weekly Hours Distribution (hours pattern)¶
# Weekly hours distribution: bucket positive, numeric hours_per_week values
# into fixed ranges and chart how many listings fall in each.
hours_numeric = pd.to_numeric(df["hours_per_week"], errors="coerce")
# NaN > 0 is False, so this one mask also removes non-numeric/missing values.
hours_clean = hours_numeric[hours_numeric > 0]

# pd.cut uses right-closed intervals: (0, 10], (10, 20], ..., (80, inf).
bins = [0, 10, 20, 30, 40, 50, 60, 80, float("inf")]
labels = ["1-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-80", "80+"]
hours_binned = pd.cut(hours_clean, bins=bins, labels=labels)

hours_dist = (
    hours_binned.value_counts()
    .reindex(labels)  # keep the natural range order, not frequency order
    .fillna(0)
    .astype(int)
    .rename_axis("range")
    .reset_index(name="count")
)

# Highlight the 31-40 bucket in orange; every other bar is blue.
colors = [
    "#4C72B0" if bucket != "31-40" else "#E05A2B" for bucket in hours_dist["range"]
]

hours_bar = go.Bar(
    x=hours_dist["range"],
    y=hours_dist["count"],
    text=hours_dist["count"],
    textposition="outside",
    marker_color=colors,
    marker_line_color="white",
    marker_line_width=1.5,
    hovertemplate="<b>%{x} hrs/week</b><br>Job Postings: %{y}<extra></extra>",
)

# Two text notes in the top-right corner act as a manual colour legend.
legend_notes = [
    {
        "x": 0.99,
        "y": 0.97,
        "xref": "paper",
        "yref": "paper",
        "text": "31-40 hrs = standard full-time",
        "showarrow": False,
        "font": {"size": 11, "color": "#E05A2B"},
        "align": "right",
    },
    {
        "x": 0.99,
        "y": 0.90,
        "xref": "paper",
        "yref": "paper",
        "text": "🔵 Other ranges",
        "showarrow": False,
        "font": {"size": 11, "color": "#4C72B0"},
        "align": "right",
    },
]

fig2 = go.Figure(data=[hours_bar])
fig2.update_layout(
    title={
        "text": "Weekly Hours Distribution<br><sup>TBD & missing values excluded</sup>",
        "font": {"size": 16},
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis={"title": "Hours per Week", "showgrid": False},
    yaxis={
        "title": "Number of Job Postings",
        "showgrid": True,
        "gridcolor": "#eeeeee",
    },
    plot_bgcolor="white",
    margin={"t": 80, "b": 60, "l": 60, "r": 40},
    annotations=legend_notes,
)
fig2.show()

print(f"Records with valid hours: {len(hours_clean):,} / {len(df):,} total")
Records with valid hours: 20,147 / 23,908 total
Pay Vs Hours¶
Scatter plots of pay vs. weekly hours, one for USD and one for PHP
# Pay vs. hours: one scatter plot per currency, coloured by work type.
#
# NOTE(review): this cell rebinds `df` to a filtered frame (monthly salary
# type, whole-number hours, non-missing salary). Any cell executed after
# this one sees the narrowed data; re-run the loading cell before
# re-running earlier sections.
df["hours_per_week"] = pd.to_numeric(df["hours_per_week"], errors="coerce")
df = df.dropna(subset=["hours_per_week", "average_salary"])

# Keep whole-number hours only. The vectorised modulo check replaces a
# per-row int() round trip, which raised OverflowError on infinite values;
# here inf % 1 is NaN, so such rows are simply filtered out.
df = df[df["hours_per_week"] % 1 == 0]
df = df[df["salary_type"] == "monthly"]

# Drop missing work types (as the earlier skills-by-work-type cell does)
# so no empty "nan" trace is added to the legend.
work_types = df["work_type"].dropna().unique()

# Fixed colour per work type so both currency figures share one legend
# colouring. Only the first five work types get an explicit colour; any
# further ones fall back to plotly's default via colors.get(wt) -> None.
colors = {
    wt: clr
    for wt, clr in zip(
        work_types, ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
    )
}

for currency in ["USD", "PHP"]:
    subset = df[df["currency"] == currency]
    fig = go.Figure()

    for wt in work_types:
        wt_data = subset[subset["work_type"] == wt]
        fig.add_trace(
            go.Scatter(
                x=wt_data["hours_per_week"],
                y=wt_data["average_salary"],
                mode="markers",
                name=wt,
                marker=dict(size=9, color=colors.get(wt), opacity=0.75),
                hovertemplate=(
                    f"<b>{wt}</b><br>"
                    "Hours/Week: %{x}<br>"
                    f"Avg Salary ({currency}): %{{y:,.2f}}<br>"
                    "%{text}<extra></extra>"
                ),
                # Hover text: job title, truncated to 60 characters.
                text=wt_data["title"].str.slice(0, 60),
            )
        )

    fig.update_layout(
        title=f"Pay vs Hours — {currency}",
        xaxis_title="Hours per Week",
        yaxis_title=f"Average Salary ({currency})",
        legend_title="Work Type",
        template="plotly_white",
        hovermode="closest",
        # Single click toggles a work type; double click isolates it.
        legend=dict(itemclick="toggle", itemdoubleclick="toggleothers"),
    )
    fig.show()
Analysis¶
Market Overview: Volume, Structure & Hours
Job Volume Over Time: Postings peaked in October 2025 (~7,000), dipped to a low of 4,153 in December, then rebounded strongly in January 2026 (6,200+).
Work Type Breakdown: Full-time dominates at 51.4%, followed by part-time (25.2%), flexible/Any (17.5%), and gig (5.8%).
Hours Distribution: The 31–40 hr/week bracket overwhelmingly leads (10,653 postings), with 11–20 hrs as a distant second (3,927). Roles outside these two ranges are uncommon.
Pay vs. Hours: Both USD and PHP scatter plots show dense clusters at 40 hrs (full-time) and 20 hrs (part-time). The wide vertical spread at 40 hours confirms that pay varies more by role and skill level than by hours worked. PHP part-time roles mostly fall under 40,000 PHP; full-time spans 20,000–100,000+ PHP.
Top Skills & Pairings: Video editing leads all skills (2,108 listings), and the entire top five is digital media-focused. Skill pairing heatmaps show three tight clusters: video production (video editing + social media video editing, 531 pairings), digital marketing (Facebook ads + Google ads, 294 pairings), and social media (management + content creation, 285 pairings).